Commit da92cb38 by Enrico Pozzobon

Merge branch 'email-submissions'

parents 90acf8b3 f9e2581f
This source diff could not be displayed because it is too large. You can view the blob instead.
//API required by the NIST for the LWC competition
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
u8 domain = 0;
u32 leftover;
//Determine which domain bits we need based on the length of the ad
if (adlen == 0) {
domain ^= 0x02; // No message, so only 1 block with padding
} else {
leftover = (u32)(adlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd ad length?
domain ^= 0x08; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x02; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x0A; // Even with a partial double block at the end
}
}
//Determine which domain bits we need based on the length of the message
if (mlen == 0) {
domain ^= 0x01; // No message, so only 1 block with padding
} else {
leftover = (u32)(mlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd message length?
domain ^= 0x04; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x01; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x05; // Even with a partial double block at the end
}
}
return domain;
}
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u64 tmp_mlen = mlen;
u32 tmp;
const u8* m_auth = m;
u8 final_domain = 0x30;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
final_domain ^= final_ad_domain(adlen, mlen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (tmp_mlen >= BLOCKBYTES) {
tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
if (tmp_mlen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
} else {
memcpy(pad, m_auth, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
tmp_mlen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (tmp_mlen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
tmp_mlen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
tmp_mlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (tmp_mlen > 0) { // Last message single block is partial
for(int i =0; i < (int)tmp_mlen; i++)
state[i] ^= m_auth[i];
state[15] ^= (u8)tmp_mlen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
// ----------------- Process the associated data -----------------
// ----------------- Generate the tag -----------------
G(state,state);
memcpy(c + mlen, state, TAGBYTES);
// ----------------- Generate the tag -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen > 0) {
SET_DOMAIN(tks, 0x24);
while (mlen > BLOCKBYTES) {
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
}
return 0;
}
//Decryption and tag verification using Romulus-N1
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp, tmp_mlen;
u8 final_domain = 0x30;
u8* m_auth = m;
const u8* c_tmp = c;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
memcpy(state, c + clen, TAGBYTES);
tmp_mlen = clen;
if (tmp_mlen > 0) {
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
SET_DOMAIN(tks, 0x24);
while (tmp_mlen > BLOCKBYTES) {
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
RHO_INV(state, c, m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
for(int i = 0; i < (int)tmp_mlen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)tmp_mlen; // Padding
}
// ----------------- Process the ciphertext -----------------
// ----------------- Process the associated data -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
memset(state, 0x00, BLOCKBYTES);
final_domain ^= final_ad_domain(adlen, clen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (clen >= BLOCKBYTES) {
tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
if (clen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
clen -= BLOCKBYTES;
} else {
memcpy(pad, m_auth, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
clen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (clen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
clen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (clen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (clen > BLOCKBYTES) { // Last message double block is partial
clen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else if (clen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (clen > 0) { // Last message single block is partial
for(int i =0; i < (int)clen; i++)
state[i] ^= m[i];
state[15] ^= (u8)clen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
// ----------------- Process the associated data -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
//Short fixed-width aliases. NOTE: u8/u32 are also typedef'd (identically) in
//skinny128.h; identical typedef redefinition is valid since C11.
typedef unsigned char u8;
typedef unsigned int u32;
//FIX: u64 was previously 'typedef unsigned int u64;' -- only 32 bits wide --
//which silently truncated 64-bit lengths (e.g. 'u64 tmp_mlen = mlen;' in
//crypto_aead_encrypt). It must be a genuine 64-bit type.
typedef unsigned long long u64;
//Tweakey material for SKINNY-128-384: raw TK1 plus precomputed round tweakeys.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Writes the 8-bit domain separator into byte 7 of TK1.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner.
//NOTE: these macros rely on a u32 variable named 'tmp' in scope at the call
//site and on GNU C statement expressions; the u8* -> u32* casts assume the
//16-byte buffers are word-aligned and little-endian.
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//Updates the 56-bit LFSR block counter held in tk1 in a 32-bit word-wise
//manner (also requires 'tmp' in scope).
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification.
//Uses a u8[16] named 'pad' at the call site as a tmp variable in case y = z.
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification.
//Uses 'pad' at the call site as a tmp variable in case y = z.
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
//Fixed-width shorthands (also typedef'd identically in romulus.h).
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds of SKINNY-128-384.
#define SKINNY128_384_ROUNDS 40
//The following routines are implemented in ARM assembly (see the .s file).
//Encrypts one 16-byte block: ctext <- SKINNY-128-384(ptext) using the
//precomputed round tweakeys 'tk' (TK2 ^ TK3 part) and 'rtk1' (TK1 part).
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
//Precomputes LFSR2(TK2) ^ LFSR3(TK3) for all rounds into 'rtk'.
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
//Applies the tweakey permutation and round constants to 'rtk' in place.
extern void tkschedule_perm(u32* rtk);
//Expands TK1 into the per-round words 'rtk1'.
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p2: applies the tweakey permutation P^2 to the bitsliced tweakey state held
@ in r6-r9 (one 32-bit slice per register). Clobbers r1, r10, r11, r12.
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0xcc000033
and r11, r1, r6, ror #14 // --- permute slice r6
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18 // permute slice r6 ---
and r11, r1, r7, ror #14 // --- permute slice r7
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18 // permute slice r7 ---
and r11, r1, r8, ror #14 // --- permute slice r8
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18 // permute slice r8 ---
and r11, r1, r9, ror #14 // --- permute slice r9
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18 // permute slice r9 ---
bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p4: applies the tweakey permutation P^4 to the bitsliced state in r6-r9.
@ Spills r14 to [sp]: the caller must have a free word at sp (tkschedule_perm
@ does 'sub.w sp, #4' for this). Clobbers r1, r10, r11, r12.
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
and r10, r14, r6, ror #22 // --- permute slice r6
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24 // permute slice r6 ---
and r10, r14, r7, ror #22 // --- permute slice r7
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24 // permute slice r7 ---
and r10, r14, r8, ror #22 // --- permute slice r8
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24 // permute slice r8 ---
and r10, r14, r9, ror #22 // --- permute slice r9 (r12 reused as scratch)
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33 (comment fixed; previously said r1)
and r9, r9, r12
orr r9, r10, r9, ror #24 // permute slice r9 ---
bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p6: applies the tweakey permutation P^6 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p8: applies the tweakey permutation P^8 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p10: applies the tweakey permutation P^10 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12. (Copy-paste comments that said 'r6' for the
@ r7/r8/r9 sections have been corrected.)
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p12: applies the tweakey permutation P^12 to the bitsliced state in r6-r9.
@ Spills r14 to [sp]: the caller must have a free word at sp (tkschedule_perm
@ does 'sub.w sp, #4' for this). Clobbers r1, r10, r11, r12.
p12:
str.w r14, [sp] //store r14 on the stack
movw r14, #0xcc33 //r14<- 0x0000cc33
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14 (done early; r14 is no longer needed)
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p14: applies the tweakey permutation P^14 to the bitsliced state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00 (comment fixed; previously annotated 0x33003300, but movw/movt encode 0xcc00cc00)
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
.align 2
@ packing: converts the 128-bit value held in r2-r5 into the bitsliced
@ representation, as a sequence of SWAPMOVE steps (see the inline comments).
@ Expects the caller to preload the masks r10 = 0x0a0a0a0a and
@ r11 = 0x30303030. Clobbers r12.
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
@ Packs TK2 and TK3 into the bitsliced form, then stores LFSR2(TK2)^LFSR3(TK3)
@ for all 'rounds' rounds into tk. The loop handles 8 rounds per iteration
@ (LFSR period-4 register rotation, 2 rounds per quarter).
@ NOTE(review): each strd pair is followed by a 24-byte post-increment, i.e.
@ 16 bytes are skipped between consecutive stores -- presumably slots filled
@ elsewhere in the schedule; confirm against tkschedule_perm's layout.
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask for packing)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask for packing; comment fixed, said r7)
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa (mask for the bitsliced LFSRs)
ldr.w r1, [sp, #12] //load loop counter in r1 (the 'rounds' argument saved by push)
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop: @ each iteration covers 8 rounds; the slice registers rotate roles
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
subs.w r1, r1, #8 //decrease loop counter by 8
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Precomputes the per-round tweakey words for all 40 rounds, in place, from
* the packed TK material at [r0]. For each group of rounds the TK words are
* masked (0xf0f0f0f0 / 0xc3c3c3c3 / derived masks), rotated to match the
* fixsliced representation, XORed with the round constants (a trailing NOT is
* folded into one word via MVN to save an instruction in the S-box layer),
* and stored back through r0. The tweakey permutations P^2..P^14 are applied
* by the p2..p14 subroutines; since P^16 = Id, the TK is simply reloaded
* (no permutation call) before the round-16/17 and round-32/33 groups.
* NOTE(review): the round-constant values below are taken on trust from the
* reference fixsliced SKINNY implementation; they are not re-derived here.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010 //add rconst
eor r8, r8, #0x00010000 //add rconst
eor r8, r8, #0x00000410 //add rconst
eor r9, r9, #0x00000410 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk (P^16 = Id, so no permutation call is needed here)
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010 //add rconst
eor r7 ,r7, #0x00000010 //add rconst
eor r8, r8, #0x00000010 //add rconst
eor r8, r8, #0x00010000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21st round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21st round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22nd round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 23rd round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31st round
strd r6, r7, [r0], #8 //store 2nd half tk for 31st round
ldm r0, {r6-r9} //load tk (P^16 = Id, so no permutation call is needed here)
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //mask to match fixslicing (no ror needed)
and r12, r10, r7 //mask to match fixslicing (no ror needed)
strd r11, r12, [r0, #24] //store 2nd half tk for 33rd round
and r11, r10, r8 //mask to match fixslicing (no ror needed)
and r12, r10, r9 //mask to match fixslicing (no ror needed)
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33rd round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010 //add rconst
eor r8, r8, #0x00010400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32nd round
strd r8, r9, [r0], #24 //store 2nd half tk for 32nd round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010 //add rconst
eor r8, r8, #0x00010000 //add rconst
eor r8, r8, #0x00000010 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0] //store 2nd half tk for 40th round (no writeback: last store)
add.w sp, #4 //release the scratch slot reserved for r14
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
* Expects: r0 = output round-tweakey array, r1 = 16-byte TK1.
* The 0xf0f0f0f0 / 0x03030303 masks extract only the tweakey bits that are
* actually XORed into the state by the fixsliced round functions.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (packing mask)
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 4 times (cumulative)
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation 6 times (cumulative)
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 8 times (cumulative)
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation 10 times (cumulative)
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 12 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation 14 times
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Four consecutive fixsliced SKINNY-128-384 rounds on the bitsliced state.
* Register contract (from the call sites and inline comments below):
*   r2-r5 : 128-bit bitsliced state
*   r6    : 0x55555555 (SWAPMOVE mask used by the bitsliced S-box)
*   r7    : 0x30303030 (MixColumns mask)
*   r0    : rtk1 pointer (advanced by ldmia; XORed in on even half-rounds)
*   r1    : rtk2_3 pointer, round constants pre-merged (advanced by ldmia)
*   r8-r11: clobbered as scratch
* Each round = bitsliced S-box (SWAPMOVE ladder) + round-tweakey addition +
* one of the four fixsliced MixColumns variants.
******************************************************************************/
.align 2
quadruple_round:
// --- 1st round: bitsliced S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
// --- 1st round: add round tweakeys (rtk2_3 then rtk1) ---
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
// --- 2nd round: bitsliced S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
// --- 2nd round: add round tweakey (rtk2_3 only on odd half-rounds) ---
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
// --- 3rd round: bitsliced S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
// --- 3rd round: add round tweakeys (rtk2_3 then rtk1) ---
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
// --- 4th round: bitsliced S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
// --- 4th round: add round tweakey ---
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384.
* NOTE(review): the prototype comment below lists 3 arguments, but the code
* reads r2 as the plaintext pointer, r1 as the rtk2_3 pointer (consumed by
* quadruple_round via ldmia r1!), r3 as the rtk1 pointer (moved to r0), and
* the saved r0 as the ciphertext pointer -- confirm against the C caller.
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14} //r0 (ctext) saved at [sp], reloaded at the end
mov.w r0, r3 //r0<- rtk1 pointer (consumed by quadruple_round)
ldr.w r3, [r2, #8] //load ptext words
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
// --- packing: byte representation -> bitsliced representation ---
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (S-box mask for quadruple_round)
// --- 40 rounds = 10 quadruple rounds; rtk1 (r0) repeats every 16 rounds ---
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
// --- unpacking: bitsliced representation -> byte representation ---
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4 //restore ctext pointer saved by the push
strd r2, r4, [r0] //store ciphertext
strd r3, r5, [r0, #8]
pop {r1-r12,r14} //r0 already popped above
bx lr
\ No newline at end of file
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
/**
 * Compute the domain-separation bits for the final tweakey, from the parity
 * and completeness of the last AD / message blocks.
 *
 * adlen - associated data length in bytes
 * mlen  - message length in bytes
 * Returns the domain byte to XOR into the final SET_DOMAIN value.
 *
 * Note: when the leftover is exactly BLOCKBYTES (odd number of blocks, last
 * one full), no bit is set on purpose -- that is the default encoding.
 */
static u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
    u8 domain = 0;
    u32 leftover;
    // Domain bits derived from the associated data length
    if (adlen == 0) {
        domain ^= 0x02; // no AD, so only 1 block with padding
    } else {
        leftover = (u32)(adlen % (2 * BLOCKBYTES));
        if (leftover == 0) {            // even or odd AD length?
            domain ^= 0x08;             // even with a full double block at the end
        } else if (leftover < BLOCKBYTES) {
            domain ^= 0x02;             // odd with a partial single block at the end
        } else if (leftover > BLOCKBYTES) {
            domain ^= 0x0A;             // even with a partial double block at the end
        }
    }
    // Domain bits derived from the message length
    if (mlen == 0) {
        domain ^= 0x01; // no message, so only 1 block with padding
    } else {
        // use the same u32 cast as above for consistency (was '(unsigned)')
        leftover = (u32)(mlen % (2 * BLOCKBYTES));
        if (leftover == 0) {            // even or odd message length?
            domain ^= 0x04;             // even with a full double block at the end
        } else if (leftover < BLOCKBYTES) {
            domain ^= 0x01;             // odd with a partial single block at the end
        } else if (leftover > BLOCKBYTES) {
            domain ^= 0x05;             // even with a partial double block at the end
        }
    }
    return domain;
}
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen = mlen;
const u8* m_auth = m;
u8 final_domain = 0x30;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
final_domain ^= final_ad_domain(adlen, mlen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (tmp_mlen >= BLOCKBYTES) {
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
if (tmp_mlen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
tmp_mlen -= BLOCKBYTES;
m_auth += BLOCKBYTES;
} else {
memcpy(pad, m_auth, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
tmp_mlen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (tmp_mlen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
tmp_mlen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
tmp_mlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (tmp_mlen > 0) { // Last message single block is partial
for(int i =0; i < (int)tmp_mlen; i++)
state[i] ^= m_auth[i];
state[15] ^= (u8)tmp_mlen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate the tag -----------------
G(state,state);
memcpy(c + mlen, state, TAGBYTES);
// ----------------- Generate the tag -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen > 0) {
SET_DOMAIN(tks, 0x24);
while (mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
}
return 0;
}
//Decryption and tag verification using Romulus-N1
//Phase 1: decrypt the ciphertext with a keystream seeded by the received tag.
//Phase 2: recompute the tag over AD and the recovered plaintext and compare it
//in constant time against the received tag (0 on success, nonzero otherwise).
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen;
u8 final_domain = 0x30;
u8* m_auth = m;
const u8* c_tmp = c;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// reject ciphertexts too short to even carry a tag
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
// seed the keystream state with the received tag (inverse of encryption)
memcpy(state, c + clen, TAGBYTES);
tmp_mlen = clen;
if (tmp_mlen > 0) {
SET_DOMAIN(tks, 0x24);
precompute_rtk2_3(tks.rtk2_3, npub, k);
while (tmp_mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
RHO_INV(state, c, m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
// partial (or exactly full) last block: inverse of G applied byte-wise
for(int i = 0; i < (int)tmp_mlen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)tmp_mlen; // Padding
}
// ----------------- Process the ciphertext -----------------
// ----------------- Process the associated data -----------------
// restart the counter/state to authenticate AD and the recovered plaintext
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
memset(state, 0x00, BLOCKBYTES);
final_domain ^= final_ad_domain(adlen, clen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad); // absorb A[2i]; A[2i+1] goes into TK2
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else { // at most one AD block left: pair it with the 1st plaintext block
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (clen >= BLOCKBYTES) { // 1st plaintext block goes into TK2
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
if (clen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
clen -= BLOCKBYTES;
} else { // plaintext shorter than one block: pad it into TK2
memcpy(pad, m_auth, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
clen = 0;
}
}
// Process all message double blocks except the last (32 == 2*BLOCKBYTES)
SET_DOMAIN(tks, 0x2C);
while (clen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
clen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (clen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen > BLOCKBYTES) { // Last message double block is partial
clen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (clen > 0) { // Last message single block is partial
// NOTE(review): uses m where the other branches use m_auth; both appear
// to point at the final partial plaintext block here -- verify.
for(int i =0; i < (int)clen; i++)
state[i] ^= m[i];
state[15] ^= (u8)clen; // Padding
}
// Final authentication call: the nonce enters the tweakey only here
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_

#include "skinny128.h"

typedef unsigned char u8;
typedef unsigned int u32;
// BUGFIX: u64 was 'unsigned int' (32-bit on this target), silently truncating
// 64-bit lengths such as 'tmp_mlen = mlen' in crypto_aead_encrypt.
typedef unsigned long long u64;

typedef struct {
    u8 tk1[16];                         //to manipulate tk1 byte-wise
    u32 rtk1[4*16];                     //to avoid tk schedule recomputations
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;

#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16

//write the domain-separation byte into the 8th byte of TK1
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))

//G as defined in the Romulus specification in a 32-bit word-wise manner
//(relies on 'tmp' (u32) being in scope at the call site)
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})

//update the 56-bit LFSR counter in tk1 in a 32-bit word-wise manner
//(relies on 'tmp' (u32) being in scope at the call site)
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})

//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})

//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})

//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})

#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0
******************************************************************************/
/* MixColumns for fixsliced rounds i with (i % 4) == 0, applied to each of the
 * four 32-bit bitsliced state words in place. */
void mixcolumns_0(u32* state) {
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];   // current state word
        u32 t;              // masked rotated copy
        t = ROR(w,24);
        t &= 0x0c0c0c0c;
        w ^= ROR(t,30);
        t = ROR(w,16);
        t &= 0xc0c0c0c0;
        w ^= ROR(t,4);
        t = ROR(w,8);
        t &= 0x0c0c0c0c;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1
******************************************************************************/
/* MixColumns for fixsliced rounds i with (i % 4) == 1, applied to each of the
 * four 32-bit bitsliced state words in place. */
void mixcolumns_1(u32* state) {
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];   // current state word
        u32 t;              // masked rotated copy
        t = ROR(w,16);
        t &= 0x30303030;
        w ^= ROR(t,30);
        t = w & 0x03030303;
        w ^= ROR(t,28);
        t = ROR(w,16);
        t &= 0x30303030;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2
******************************************************************************/
/* MixColumns for fixsliced rounds i with (i % 4) == 2, applied to each of the
 * four 32-bit bitsliced state words in place. */
void mixcolumns_2(u32* state) {
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];   // current state word
        u32 t;              // masked rotated copy
        t = ROR(w,8);
        t &= 0xc0c0c0c0;
        w ^= ROR(t,6);
        t = ROR(w,16);
        t &= 0x0c0c0c0c;
        w ^= ROR(t,28);
        t = ROR(w,24);
        t &= 0xc0c0c0c0;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3
******************************************************************************/
/******************************************************************************
* MixColumns for rounds i with (i % 4) == 3 in the fixsliced representation.
* Each of the 4 bitsliced words is updated independently and in place.
******************************************************************************/
void mixcolumns_3(u32* state) {
	for(int col = 0; col < 4; col++) {
		u32 w = state[col];
		u32 t = w & 0x03030303;
		w ^= ROR(t,30);
		t = w & 0x30303030;
		w ^= ROR(t,4);
		t = w & 0x03030303;
		w ^= ROR(t,26);
		state[col] = w;
	}
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
******************************************************************************/
/******************************************************************************
* Single-block SKINNY-128-384 encryption (no operation mode).
* RTK1 and RTK2_3 are supplied separately: TK2/TK3 are fixed for an entire
* encryption/decryption, whereas RTK1 only has a 16-round period, so the
* RTK1 table is reused every 4 quadruple rounds.
******************************************************************************/
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
					const u32* rtk2_3) {
	u32 tmp; 				// scratch word required by the SWAPMOVE macro
	u32 state[4]; 			// 128-bit bitsliced state
	packing(state, ptext); 	// from byte to bitsliced representation
	// 40 rounds = 10 quadruple rounds; rtk1 offsets cycle 0,16,32,48
	for(int q = 0; q < 10; q++)
		QUADRUPLE_ROUND(state, rtk1 + 16*(q & 3), rtk2_3 + 16*q);
	unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
// Single-block SKINNY-128-384 encryption on a 128-bit block; rtk1 holds the
// 16-round TK1 tweakeys (reused periodically), rtk2_3 the per-round
// TK2^TK3^rconst tweakeys.
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 40
// Four consecutive SKINNY rounds in the fixsliced representation.
// Each round is: bitsliced S-box (NOR/XOR + SWAPMOVE network), round-tweakey
// addition (4 words of rtk1 and rtk2_3 per round), then the MixColumns
// variant matching the round index mod 4 (mixcolumns_0..3).
// NOTE: expands to a GNU statement expression and expects a u32 `tmp`
// variable to be in scope at the call site (used by SWAPMOVE).
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[0]; \
	state[1] ^= (rtk1)[1]; \
	state[2] ^= (rtk1)[2]; \
	state[3] ^= (rtk1)[3]; \
	state[0] ^= (rtk2_3)[0]; \
	state[1] ^= (rtk2_3)[1]; \
	state[2] ^= (rtk2_3)[2]; \
	state[3] ^= (rtk2_3)[3]; \
	mixcolumns_0(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[4]; \
	state[1] ^= (rtk1)[5]; \
	state[2] ^= (rtk1)[6]; \
	state[3] ^= (rtk1)[7]; \
	state[0] ^= (rtk2_3)[4]; \
	state[1] ^= (rtk2_3)[5]; \
	state[2] ^= (rtk2_3)[6]; \
	state[3] ^= (rtk2_3)[7]; \
	mixcolumns_1(state); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[8]; \
	state[1] ^= (rtk1)[9]; \
	state[2] ^= (rtk1)[10]; \
	state[3] ^= (rtk1)[11]; \
	state[0] ^= (rtk2_3)[8]; \
	state[1] ^= (rtk2_3)[9]; \
	state[2] ^= (rtk2_3)[10]; \
	state[3] ^= (rtk2_3)[11]; \
	mixcolumns_2(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[12]; \
	state[1] ^= (rtk1)[13]; \
	state[2] ^= (rtk1)[14]; \
	state[3] ^= (rtk1)[15]; \
	state[0] ^= (rtk2_3)[12]; \
	state[1] ^= (rtk2_3)[13]; \
	state[2] ^= (rtk2_3)[14]; \
	state[3] ^= (rtk2_3)[15]; \
	mixcolumns_3(state); \
})
#endif  // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
// Round constants in the bitsliced/fixsliced representation:
// 40 rounds x 4 state words = 160 entries, XORed into the round tweakeys
// by precompute_rtk2_3().
u32 rconst_32_bs[160] = {
	0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
	0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
	0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
	0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
	0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
	0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
	0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
	0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
	0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
	0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
	0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
	0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
	0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
	0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
	0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
	0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
	0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
	0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
	0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
	0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
	0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
	0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
	0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
	0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
	0x00000010, 0x00000000, 0x00010010, 0xfffffbff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
/******************************************************************************
* Pack a 16-byte block into the 4-word bitsliced representation used by the
* fixsliced SKINNY round function. Note the interleaved load order: bytes
* 8..11 go to word 1 and bytes 4..7 to word 2. The SWAPMOVE sequence below
* is order-critical; unpacking() applies the exact inverse.
******************************************************************************/
void packing(u32* out, const u8* in) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	LE_LOAD(out, in);
	LE_LOAD(out + 1, in + 8);
	LE_LOAD(out + 2, in + 4);
	LE_LOAD(out + 3, in + 12);
	// In-word bit swaps first, then cross-word swaps
	SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
	SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
	SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
	SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
	SWAPMOVE(out[2], out[0], 0x30303030, 2);
	SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
	SWAPMOVE(out[3], out[0], 0x03030303, 6);
	SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
	SWAPMOVE(out[3], out[2], 0x03030303, 4);
	SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation
******************************************************************************/
/******************************************************************************
* Unpack the 4-word bitsliced representation back into a 16-byte block.
* Exact inverse of packing(): the SWAPMOVE sequence is applied in reverse
* order (SWAPMOVE is an involution), then the words are stored with the same
* interleaved layout (word 1 -> bytes 8..11, word 2 -> bytes 4..7).
* NOTE: mutates `in` in place.
******************************************************************************/
void unpacking(u8* out, u32 *in) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	SWAPMOVE(in[3], in[1], 0x03030303, 2);
	SWAPMOVE(in[3], in[2], 0x03030303, 4);
	SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
	SWAPMOVE(in[3], in[0], 0x03030303, 6);
	SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
	SWAPMOVE(in[2], in[0], 0x30303030, 2);
	SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
	SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
	SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
	SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
	LE_STORE(out, in[0]);
	LE_STORE(out + 8, in[1]);
	LE_STORE(out + 4, in[2]);
	LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
******************************************************************************/
/******************************************************************************
* TK2 LFSR in the bitsliced representation: slices rotate down one position
*      0 4            1 5
*      1 5    --->    2 6
*      2 6            3 7
*      3 7            4 0
* and the feedback slice is derived from slices 0 and 2.
******************************************************************************/
void lfsr2_bs(u32* tk) {
	// Compute the feedback before the slices are shifted down
	u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
	tk[0] = tk[1];
	tk[1] = tk[2];
	tk[2] = tk[3];
	// Swap the two interleaved bit lanes of the feedback word
	tk[3] = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
}
/******************************************************************************
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
******************************************************************************/
/******************************************************************************
* TK3 LFSR in the bitsliced representation: slices rotate up one position
*      0 4            7 3
*      1 5    --->    0 4
*      2 6            1 5
*      3 7            2 6
* and the feedback slice is derived from slices 3 and 1.
******************************************************************************/
void lfsr3_bs(u32* tk) {
	// Compute the feedback before the slices are shifted up
	u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
	tk[3] = tk[2];
	tk[2] = tk[1];
	tk[1] = tk[0];
	// Swap the two interleaved bit lanes of the feedback word
	tk[0] = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, twice
******************************************************************************/
/******************************************************************************
* Apply P^2 (the SKINNY tweakey permutation squared) to each bitsliced word.
******************************************************************************/
void permute_tk_2(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,14) & 0xcc00cc00)
		      | ((t & 0x000000ff) << 16)
		      | ((t & 0xcc000000) >> 2)
		      | ((t & 0x0033cc00) >> 8)
		      | ((t & 0x00cc0000) >> 18);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
/******************************************************************************
* Apply P^4 (the SKINNY tweakey permutation, 4 times) to each bitsliced word.
******************************************************************************/
void permute_tk_4(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,22) & 0xcc0000cc)
		      | (ROR(t,16) & 0x3300cc00)
		      | (ROR(t,24) & 0x00cc3300)
		      | ((t & 0x00cc00cc) >> 2);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
/******************************************************************************
* Apply P^6 (the SKINNY tweakey permutation, 6 times) to each bitsliced word.
******************************************************************************/
void permute_tk_6(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,6) & 0xcccc0000)
		      | (ROR(t,24) & 0x330000cc)
		      | (ROR(t,10) & 0x3333)
		      | ((t & 0xcc) << 14)
		      | ((t & 0x3300) << 2);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
/******************************************************************************
* Apply P^8 (the SKINNY tweakey permutation, 8 times) to each bitsliced word.
******************************************************************************/
void permute_tk_8(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,24) & 0xcc000033)
		      | (ROR(t,8) & 0x33cc0000)
		      | (ROR(t,26) & 0x00333300)
		      | ((t & 0x00333300) >> 6);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
/******************************************************************************
* Apply P^10 (the SKINNY tweakey permutation, 10 times) to each bitsliced word.
******************************************************************************/
void permute_tk_10(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,8) & 0xcc330000)
		      | (ROR(t,26) & 0x33000033)
		      | (ROR(t,22) & 0x00cccc00)
		      | ((t & 0x00330000) >> 14)
		      | ((t & 0xcc00) >> 2);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
/******************************************************************************
* Apply P^12 (the SKINNY tweakey permutation, 12 times) to each bitsliced word.
******************************************************************************/
void permute_tk_12(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,8) & 0xcc33)
		      | (ROR(t,30) & 0x00cc00cc)
		      | (ROR(t,10) & 0x33330000)
		      | (ROR(t,16) & 0xcc003300);
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
/******************************************************************************
* Apply P^14 (the SKINNY tweakey permutation, 14 times) to each bitsliced word.
******************************************************************************/
void permute_tk_14(u32* tk) {
	for(int j = 0; j < 4; j++) {
		u32 t = tk[j];
		tk[j] = (ROR(t,24) & 0x0033cc00)
		      | (ROR(t,14) & 0x00cc0000)
		      | (ROR(t,30) & 0xcc000000)
		      | (ROR(t,16) & 0x000000ff)
		      | (ROR(t,18) & 0x33003300);
	}
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
/******************************************************************************
* Precompute every LFSR application on TK2 in bitsliced form.
* The LFSR advances once per 2 rounds; each state is stored at word offset
* 4*(r-1) for r = 2, 4, ..., rounds (i.e. offsets 4, 12, 20, ...), leaving
* the intermediate 4-word slots untouched for permute_tk() to fill.
******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
	u32 slices[4];
	packing(slices, key);
	memcpy(tk, slices, 16); // initial TK2 state at offset 0
	for(int r = 2; r <= rounds; r += 2) {
		lfsr2_bs(slices);
		memcpy(tk + 4*(r - 1), slices, 16);
	}
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
/******************************************************************************
* Precompute every LFSR application on TK3 in bitsliced form and XOR the
* results on top of the precomputed TK2 states (same offsets as
* precompute_lfsr_tk2: 0, then 4, 12, 20, ...).
******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
	u32 slices[4];
	packing(slices, key);
	XOR_BLOCKS(tk, slices); // initial TK3 state folded into offset 0
	for(int i = 0; i < rounds; i += 2) {
		lfsr3_bs(slices);
		XOR_BLOCKS(tk + i*4 + 4, slices);
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
* Processes 8 rounds per loop iteration, reading the precomputed LFSR states
* at word offsets i*4+4, +12, +20, +28 and expanding them into all 8 round
* slots. Since the tweakey permutation P has order 16, iterations alternate
* between powers P^2..P^8 (first half of a 16-round period) and
* P^10..P^16 = identity (second half).
******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
	u32 test;         // flag: 1 in the first half of each 16-round period
	u32 tk1[4], tmp[4];
	packing(tk1, key);
	memcpy(tmp, tk, 16);
	tmp[0] ^= tk1[0];
	tmp[1] ^= tk1[1];
	tmp[2] ^= tk1[2];
	tmp[3] ^= tk1[3];
	for(int i = 0 ; i < rounds; i += 8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		// rounds i and i+1: no permutation, only a word rotation + masking
		tk[i*4] = tmp[2] & 0xf0f0f0f0;
		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+4, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		// rounds i+2 and i+3
		tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
		tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
		tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
		tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+12, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		// rounds i+4 and i+5
		for(int j = 0; j < 4; j++) {
			tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
			tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
		}
		tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+20, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		// rounds i+6 and i+7
		tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
		tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
		tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
		tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
		tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
		tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
		tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
		tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+28, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8
		// no else branch: P^16 is the identity, so nothing to apply
		for(int j = 0; j < 4; j++) {
			tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
			tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
		}
		if (test && (i+8 < rounds)) { //only if next loop iteration
			tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
			tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
			tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
			tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
		}
	}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
* The LFSR precomputations only write every other 4-word slot (offsets 0,
* then 4, 12, 20, ...), so the 16 bytes at rtk+8 still hold the zeros from
* the memset when permute_tk() reads them as its "TK1" argument — i.e. the
* TK1 contribution is nil here and permute_tk() performs a pure rearrangement.
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is all-zero here
	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
		for(int j = 0; j < 4; j++)
			rtk[i*4+j] ^= rconst_32_bs[i*4+j];
	}
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
/******************************************************************************
* Precompute the round tweakeys derived from TK1. Only 16 rounds are stored:
* TK1 has no LFSR, so its schedule repeats with period 16 and callers cycle
* through this table (see skinny128_384_plus, which reuses rtk1+0/16/32/48).
******************************************************************************/
void precompute_rtk1(u32* rtk1, const u8* tk1) {
	memset(rtk1, 0x00, 16*16); // 16 rounds x 4 words x 4 bytes
	permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
// Byte block <-> bitsliced representation conversions
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
// Round-tweakey precomputation: TK2/TK3 (with LFSRs + rconsts) and TK1
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
// 32-bit right rotation; y must satisfy 0 < y < 32 (y = 0 or 32 would be UB)
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
// 128-bit XOR accumulate: x ^= y, word by word (GNU statement expression)
#define XOR_BLOCKS(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
})
// Swap the bits selected by `mask` between b and (a >> n); the classical
// bitslicing primitive. NOTE: requires a u32 `tmp` variable in scope at the
// call site.
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = (b ^ (a >> n)) & mask; \
	b ^= tmp; \
	a ^= (tmp << n); \
})
// Little-endian 32-bit load from byte pointer y into *x
#define LE_LOAD(x, y) \
	*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0]);
// Little-endian 32-bit store of y into byte pointer x
#define LE_STORE(x, y) \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24;
#endif  // TK_SCHEDULE_H_
\ No newline at end of file
// Parameter sizes advertised through the NIST LWC API
#define CRYPTO_KEYBYTES 16   // 128-bit key
#define CRYPTO_NSECBYTES 0   // no secret message number
#define CRYPTO_NPUBBYTES 16  // 128-bit public nonce
#define CRYPTO_ABYTES 16     // ciphertext expansion (tag length)
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
			const unsigned char *m, unsigned long long mlen,
			const unsigned char *ad, unsigned long long adlen,
			const unsigned char *nsec, const unsigned char *npub,
			const unsigned char *k);
//API required by the NIST for the LWC competition
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
			unsigned char *nsec,
			const unsigned char *c, unsigned long long clen,
			const unsigned char *ad, unsigned long long adlen,
			const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
u8 domain = 0;
u32 leftover;
//Determine which domain bits we need based on the length of the ad
if (adlen == 0) {
domain ^= 0x02; // No message, so only 1 block with padding
} else {
leftover = (u32)(adlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd ad length?
domain ^= 0x08; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x02; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x0A; // Even with a partial double block at the end
}
}
//Determine which domain bits we need based on the length of the message
if (mlen == 0) {
domain ^= 0x01; // No message, so only 1 block with padding
} else {
leftover = (u32)(mlen % (2 * BLOCKBYTES));
if (leftover == 0) { // Even or odd message length?
domain ^= 0x04; // Even with a full double block at the end
} else if (leftover < BLOCKBYTES) {
domain ^= 0x01; // Odd with a partial single block at the end
} else if (leftover > BLOCKBYTES) {
domain ^= 0x05; // Even with a partial double block at the end
}
}
return domain;
}
//Encryption and authentication using Romulus-N1
/******************************************************************************
* Romulus-N1 authenticated encryption (NIST LWC API).
* Phase 1 authenticates AD then message in double blocks (tag computed first),
* phase 2 encrypts the message with the nonce-derived round tweakeys.
* Returns 0; writes mlen + TAGBYTES bytes to c and sets *clen accordingly.
* NOTE: tmp_mlen is u64 — this must be a 64-bit type (see romulus.h).
******************************************************************************/
int crypto_aead_encrypt
	(unsigned char *c, unsigned long long *clen,
	 const unsigned char *m, unsigned long long mlen,
	 const unsigned char *ad, unsigned long long adlen,
	 const unsigned char *nsec,
	 const unsigned char *npub,
	 const unsigned char *k) {
	u64 tmp_mlen = mlen;      // message bytes still to authenticate
	u32 tmp;                  // scratch word required by G/UPDATE_CTR macros
	const u8* m_auth = m;     // cursor over the message for the auth phase
	u8 final_domain = 0x30;
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES];
	(void)nsec;               // unused: CRYPTO_NSECBYTES is 0
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	final_domain ^= final_ad_domain(adlen, mlen);
	SET_DOMAIN(tks, 0x28);
	while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		// 2nd half of the double block is absorbed as TK2 of the cipher call
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		ad += 2*BLOCKBYTES;
		adlen -= 2*BLOCKBYTES;
	}
	// Pad and process the left-over blocks
	if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else if (adlen > BLOCKBYTES) { // Left-over partial double block
		adlen -= BLOCKBYTES;         // now the length of the partial 2nd half
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		memcpy(pad, ad + BLOCKBYTES, adlen);
		memset(pad + adlen, 0x00, 15 - adlen);
		pad[15] = adlen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else {                         // adlen <= BLOCKBYTES: pair the last AD
		SET_DOMAIN(tks, 0x2C);       // block with the first message block
		UPDATE_CTR(tks.tk1);
		if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
		} else { // Left-over partial single block
			for(int i =0; i < adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
		}
		if (tmp_mlen >= BLOCKBYTES) { // absorb 1st message block as TK2
			tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			if (tmp_mlen > BLOCKBYTES)
				UPDATE_CTR(tks.tk1);
			m_auth += BLOCKBYTES;
			tmp_mlen -= BLOCKBYTES;
		} else {                      // message fits in one padded block
			memcpy(pad, m_auth, tmp_mlen);
			memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
			pad[15] = (u8)tmp_mlen; // Padding
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			tmp_mlen = 0;
		}
	}
	// Process all message double blocks except the last
	SET_DOMAIN(tks, 0x2C);
	while (tmp_mlen > 32) { // 32 = 2*BLOCKBYTES
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		m_auth += 2 * BLOCKBYTES;
		tmp_mlen -= 2 * BLOCKBYTES;
	}
	// Process the last message double block
	if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
		tmp_mlen -= BLOCKBYTES;
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
		memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
		pad[15] = (u8)tmp_mlen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
		XOR_BLOCK(state, state, m_auth);
	} else if (tmp_mlen > 0) { // Last message single block is partial
		for(int i =0; i < (int)tmp_mlen; i++)
			state[i] ^= m_auth[i];
		state[15] ^= (u8)tmp_mlen; // Padding
	}
	// Final call with the nonce as TK2 and the accumulated domain bits
	SET_DOMAIN(tks, final_domain);
	UPDATE_CTR(tks.tk1);
	tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(tks.rtk);
	tkschedule_perm_tk1(tks.rtk1, tks.tk1);
	skinny128_384(state, tks.rtk, state, tks.rtk1);
	// ----------------- Process the associated data -----------------
	// ----------------- Generate the tag -----------------
	G(state,state);
	memcpy(c + mlen, state, TAGBYTES);
	// ----------------- Generate the tag -----------------
	// ----------------- Encryption phase -----------------
	// tks.rtk still holds the nonce-derived round tweakeys from the tag call
	memset(tks.tk1, 0x00, KEYBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	if (mlen > 0) {
		SET_DOMAIN(tks, 0x24);
		while (mlen > BLOCKBYTES) {
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			RHO(state,c,m);
			UPDATE_CTR(tks.tk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			mlen -= BLOCKBYTES;
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		// Rho applied byte-wise for the final (possibly partial) block
		for(int i = 0; i < (int)mlen; i++) {
			tmp = m[i]; // Use of tmp variable in case c = m
			c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
			state[i] ^= (u8)tmp;
		}
		state[15] ^= (u8)mlen; // Padding
	}
	return 0;
}
//Decryption and tag verification using Romulus-N1
/******************************************************************************
* Romulus-N1 decryption and tag verification (NIST LWC API).
* Phase 1 decrypts the ciphertext (seeding the state with the received tag),
* phase 2 re-authenticates AD and recovered plaintext; the recomputed tag is
* compared in constant time. Returns 0 iff the tag is valid; the plaintext
* is written to m even on failure (callers must check the return value).
* NOTE(review): tmp_mlen is u32 here, which truncates lengths >= 2^32 —
* acceptable for LWC test sizes but worth confirming against requirements.
******************************************************************************/
int crypto_aead_decrypt
	(unsigned char *m, unsigned long long *mlen,
	 unsigned char *nsec,
	 const unsigned char *c, unsigned long long clen,
	 const unsigned char *ad, unsigned long long adlen,
	 const unsigned char *npub,
	 const unsigned char *k) {
	u32 tmp, tmp_mlen;
	u8 final_domain = 0x30;
	u8* m_auth = m;        // cursor over the recovered plaintext (auth phase)
	const u8* c_tmp = c;   // original ciphertext pointer, kept for the tag
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES];
	(void)nsec;            // unused: CRYPTO_NSECBYTES is 0
	if (clen < TAGBYTES)   // ciphertext must at least contain the tag
		return -1;
	// ----------------- Initialization -----------------
	*mlen = clen - TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the ciphertext -----------------
	clen -= TAGBYTES;
	memcpy(state, c + clen, TAGBYTES); // state starts from the received tag
	tmp_mlen = clen;
	if (tmp_mlen > 0) {
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		SET_DOMAIN(tks, 0x24);
		while (tmp_mlen > BLOCKBYTES) {
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			RHO_INV(state, c, m);
			UPDATE_CTR(tks.tk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			tmp_mlen -= BLOCKBYTES;
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		// Inverse Rho applied byte-wise for the final (possibly partial) block
		for(int i = 0; i < (int)tmp_mlen; i++) {
			m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
			state[i] ^= m[i];
		}
		state[15] ^= (u8)tmp_mlen; // Padding
	}
	// ----------------- Process the ciphertext -----------------
	// ----------------- Process the associated data -----------------
	// Re-run the authentication phase over AD and the recovered plaintext
	memset(tks.tk1, 0x00, KEYBYTES);
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	memset(state, 0x00, BLOCKBYTES);
	final_domain ^= final_ad_domain(adlen, clen);
	SET_DOMAIN(tks, 0x28);
	while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		ad += 2*BLOCKBYTES;
		adlen -= 2*BLOCKBYTES;
	}
	// Pad and process the left-over blocks
	if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else if (adlen > BLOCKBYTES) { // Left-over partial double block
		adlen -= BLOCKBYTES;         // now the length of the partial 2nd half
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, ad);
		memcpy(pad, ad + BLOCKBYTES, adlen);
		memset(pad + adlen, 0x00, 15 - adlen);
		pad[15] = adlen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
	} else {                         // adlen <= BLOCKBYTES: pair the last AD
		SET_DOMAIN(tks, 0x2C);       // block with the first plaintext block
		UPDATE_CTR(tks.tk1);
		if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
		} else { // Left-over partial single block
			for(int i =0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
		}
		if (clen >= BLOCKBYTES) { // absorb 1st plaintext block as TK2
			tkschedule_lfsr(tks.rtk, m_auth, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			if (clen > BLOCKBYTES)
				UPDATE_CTR(tks.tk1);
			m_auth += BLOCKBYTES;
			clen -= BLOCKBYTES;
		} else {                  // plaintext fits in one padded block
			memcpy(pad, m_auth, clen);
			memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
			pad[15] = (u8)clen; // Padding
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			clen = 0;
		}
	}
	// Process all message double blocks except the last
	SET_DOMAIN(tks, 0x2C);
	while (clen > 32) { // 32 = 2*BLOCKBYTES
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		UPDATE_CTR(tks.tk1);
		m_auth += 2 * BLOCKBYTES;
		clen -= 2 * BLOCKBYTES;
	}
	// Process the last message double block
	if (clen == 2 * BLOCKBYTES) { // Last message double block is full
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		tkschedule_lfsr(tks.rtk, m_auth + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (clen > BLOCKBYTES) { // Last message double block is partial
		clen -= BLOCKBYTES;
		UPDATE_CTR(tks.tk1);
		XOR_BLOCK(state, state, m_auth);
		memcpy(pad, m_auth + BLOCKBYTES, clen);
		memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
		pad[15] = (u8)clen; // Padding
		tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else if (clen == BLOCKBYTES) { // Last message single block is full
		XOR_BLOCK(state, state, m_auth);
	} else if (clen > 0) { // Last message single block is partial
		// NOTE: m points at the final partial plaintext block here, which
		// coincides with m_auth (both advanced by whole blocks only)
		for(int i =0; i < (int)clen; i++)
			state[i] ^= m[i];
		state[15] ^= (u8)clen; // Padding
	}
	// Final call with the nonce as TK2 and the accumulated domain bits
	SET_DOMAIN(tks, final_domain);
	UPDATE_CTR(tks.tk1);
	tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(tks.rtk);
	tkschedule_perm_tk1(tks.rtk1, tks.tk1);
	skinny128_384(state, tks.rtk, state, tks.rtk1);
	// ----------------- Process the associated data -----------------
	// ----------------- Generate and check the tag -----------------
	G(state,state);
	tmp = 0;
	for(int i = 0; i < TAGBYTES; i++)
		tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
	// ----------------- Generate and check the tag -----------------
	return tmp; // 0 iff the tags match
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
// FIX: u64 was previously 'unsigned int', which is 32-bit on common ABIs.
// crypto_aead_encrypt copies the 64-bit NIST-API length into a u64
// (u64 tmp_mlen = mlen;), so a 32-bit u64 silently truncated messages
// longer than 2^32 - 1 bytes. A 64-bit type restores full-range lengths.
typedef unsigned long long u64;
// Tweakey material for SKINNY-128-384 as used by Romulus-N1
typedef struct {
	u8 tk1[16]; 						//to manipulate tk1 byte-wise
	u32 rtk1[4*16]; 					//to avoid tk schedule recomputations
	u32 rtk[4*SKINNY128_384_ROUNDS]; 	//all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Write the domain-separation byte into TK1 (byte 7)
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
// NOTE: the G/UPDATE_CTR macros expect a u32 `tmp` variable in scope
#define G(x,y) ({ \
	tmp = ((u32*)(y))[0]; \
	((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[1]; \
	((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[2]; \
	((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[3]; \
	((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner (56-bit LFSR)
#define UPDATE_CTR(tk1) ({ \
	tmp = ((u32*)(tk1))[1]; \
	((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
	((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
	((u32*)(tk1))[1] |= tmp & 0xff000000; \
	((u32*)(tk1))[0] <<= 1; \
	if ((tmp >> 23) & 0x01) \
		((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
	((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
	((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
	((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
	((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(y, pad, z); \
	XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
	G(pad, x); \
	XOR_BLOCK(z, pad, y); \
	XOR_BLOCK(x, x, z); \
})
#endif  // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
//basic integer types (u8 = byte, u32 = 32-bit word)
typedef unsigned char u8;
typedef unsigned int u32;
//number of rounds for each SKINNY-128 variant (128/256/384-bit tweakey)
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
//encrypt one 128-bit block: ctext <- SKINNY-128-384(ptext) using the
//precomputed round tweakeys tk (TK2^TK3 part) and rtk1 (TK1 part);
//implemented in ARM assembly
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
//precompute LFSR2(TK2) ^ LFSR3(TK3) for all rounds into rtk (ARM assembly)
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
//apply the permutation P and add round constants to all round tweakeys in rtk (ARM assembly)
extern void tkschedule_perm(u32* rtk);
//precompute the TK1 round tweakeys into rtk1 (ARM assembly)
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
 * For more details, see the fixslicing paper: https://eprint.iacr.org/2020/1123
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p2: applies P^2 (the SKINNY tweakey permutation twice) to the bitsliced
@ tweakey state held in r6-r9. Clobbers r1, r10, r11, r12.
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0xcc000033
and r11, r1, r6, ror #14 // --- permute r6 twice
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18 // permute r6 twice ---
and r11, r1, r7, ror #14 // --- permute r7 twice
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18 // permute r7 twice ---
and r11, r1, r8, ror #14 // --- permute r8 twice
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18 // permute r8 twice ---
and r11, r1, r9, ror #14 // --- permute r9 twice
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18 // permute r9 twice ---
bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p4: applies P^4 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12; r14 is saved/restored via [sp].
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
and r10, r14, r6, ror #22 // --- permute r6 4 times
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24 // permute r6 4 times ---
and r10, r14, r7, ror #22 // --- permute r7 4 times
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24 // permute r7 4 times ---
and r10, r14, r8, ror #22 // --- permute r8 4 times
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24 // permute r8 4 times ---
and r10, r14, r9, ror #22 // --- permute r9 4 times
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33
and r9, r9, r12
orr r9, r10, r9, ror #24 // permute r9 4 times ---
bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p6: applies P^6 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p8: applies P^8 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p10: applies P^10 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p12: applies P^12 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12; r14 is saved/restored via [sp].
p12:
str.w r14, [sp] //store r14 on the stack
movw r14, #0xcc33 //r14<- 0x0000cc33
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
@ p14: applies P^14 to the bitsliced tweakey state in r6-r9.
@ Clobbers r1, r10, r11, r12.
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00 (original comment said 0x33003300)
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
.align 2
@ packing: repacks the 128-bit state in r2-r5 into the bitsliced (fixsliced)
@ representation using a sequence of SWAPMOVE steps.
@ Caller must preload the masks r10 = 0x0a0a0a0a and r11 = 0x30303030;
@ clobbers r12. SWAPMOVE(a, b, m, n) swaps the bits of a selected by m
@ with the bits of b selected by (m << n).
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
@ tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds):
@ packs TK2 and TK3 into the bitsliced representation, then writes
@ LFSR2(TK2) ^ LFSR3(TK3) for all rounds into tk. The main loop computes
@ 8 rounds per iteration; the bitsliced LFSRs advance one slice at a time,
@ which is why the register roles rotate between the 4 unrolled sections
@ and why every other store skips 24 bytes (those slots are filled later
@ by tkschedule_perm with the permuted/rconst-added halves).
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (original comment said r7)
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa
ldr.w r1, [sp, #12] //load loop counter in r1
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop:
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk
subs.w r1, r1, #8 //decrease loop counter by 8
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add const
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 41th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000010 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 41th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0], #24 //store 2nd half tk for 40th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 42th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 42th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 43th round
strd r6, r7, [r0], #8 //store 2nd half tk for 43th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 45th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 45th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 44th round
strd r8, r9, [r0], #24 //store 2nd half tk for 44th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 46th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 46th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 47th round
strd r6, r7, [r0], #8 //store 2nd half tk for 47th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 49th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 49th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 48th round
strd r8, r9, [r0], #24 //store 2nd half tk for 48th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 50th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000140 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 50th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 51th round
strd r6, r7, [r0], #8 //store 2nd half tk for 51th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 53th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 53th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 52th round
strd r8, r9, [r0], #24 //store 2nd half tk for 52th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 54th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 54th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 55th round
strd r6, r7, [r0], #8 //store 2nd half tk for 55th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 56th round
strd r8, r9, [r0], #24 //store 2nd half tk for 56th round
add.w sp, #4
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Computes the round tweakey material derived from TK1, in fixsliced
* representation, for the 8 rounds that consume TK1 (rounds 1,3,...,15).
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1, so the schedule
* repeats with period 16.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (mask consumed by 'packing')
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask consumed by 'packing')
bl packing //pack tk1 into bitsliced representation
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice (P^2 in total)
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8] //store tk for 3rd round
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12] //store tk for 3rd round
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4] //store tk for 3rd round
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^4 in total)
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation twice more (P^6 in total)
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8] //store tk for 7th round
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12] //store tk for 7th round
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4] //store tk for 7th round
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^8 in total)
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation twice more (P^10 in total)
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8] //store tk for 11th round
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12] //store tk for 11th round
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4] //store tk for 11th round
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^12 in total)
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation twice more (P^14 in total)
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8] //store tk for 15th round
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12] //store tk for 15th round
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4] //store tk for 15th round
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Subroutine applying 4 consecutive rounds of fixsliced SKINNY-128.
* Each round: bitsliced S-box (SWAPMOVE-based), round tweakey addition, then
* one of the 4 fixsliced MixColumns variants (0..3).
* Register contract (set up by the caller, see skinny128_384):
*   r2-r5  state (bitsliced)
*   r6     0x55555555 (SWAPMOVE mask)
*   r7     0x30303030 (MixColumns mask)
*   r0     rtk_1 pointer  (advanced by 32 bytes: consumed on rounds 1 and 3)
*   r1     rtk_2_3+rconst pointer (advanced by 64 bytes: consumed each round)
* Clobbers r8-r11; returns with bx lr.
******************************************************************************/
.align 2
quadruple_round:
// --- 1st round: bitsliced sbox ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
// --- 2nd round: bitsliced sbox ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
// --- 3rd round: bitsliced sbox ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
// --- 4th round: bitsliced sbox ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384 (note: the function
* name and the separate tk/rtk1 pointers indicate the 384-bit tweakey variant,
* not SKINNY-128-128 as previously stated).
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14}
mov.w r0, r3
ldr.w r3, [r2, #8]
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4
strd r2, r4, [r0]
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
\ No newline at end of file
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
//Compute the domain-separation byte for the final SKINNY call, following the
//Romulus-N1 specification: one group of bits encodes how the associated data
//terminates, the other how the message terminates (empty, partial single
//block, full/partial double block). A remainder of exactly BLOCKBYTES (full
//single block) deliberately sets no bit.
static u8 final_ad_domain (unsigned long long adlen, unsigned long long mlen) {
    u8 bits = 0;
    u32 rem;
    // Bits derived from the associated data length
    if (adlen == 0) {
        bits ^= 0x02;           // empty AD: a single padded block
    } else {
        rem = (u32)(adlen % (2 * BLOCKBYTES));
        if (rem == 0)
            bits ^= 0x08;       // ends with a full double block
        else if (rem < BLOCKBYTES)
            bits ^= 0x02;       // ends with a partial single block
        else if (rem > BLOCKBYTES)
            bits ^= 0x0A;       // ends with a partial double block
    }
    // Bits derived from the message length
    if (mlen == 0) {
        bits ^= 0x01;           // empty message: a single padded block
    } else {
        rem = (u32)(mlen % (2 * BLOCKBYTES));
        if (rem == 0)
            bits ^= 0x04;       // ends with a full double block
        else if (rem < BLOCKBYTES)
            bits ^= 0x01;       // ends with a partial single block
        else if (rem > BLOCKBYTES)
            bits ^= 0x05;       // ends with a partial double block
    }
    return bits;
}
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen = mlen;
const u8* m_auth = m;
u8 final_domain = 0x30;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
final_domain ^= final_ad_domain(adlen, mlen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
if (tmp_mlen >= BLOCKBYTES) {
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
if (tmp_mlen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
tmp_mlen -= BLOCKBYTES;
m_auth += BLOCKBYTES;
} else {
memcpy(pad, m_auth, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
tmp_mlen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (tmp_mlen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
tmp_mlen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (tmp_mlen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen > BLOCKBYTES) { // Last message double block is partial
tmp_mlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, tmp_mlen);
memset(pad + tmp_mlen, 0x00, BLOCKBYTES - tmp_mlen - 1);
pad[15] = (u8)tmp_mlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (tmp_mlen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (tmp_mlen > 0) { // Last message single block is partial
for(int i =0; i < (int)tmp_mlen; i++)
state[i] ^= m_auth[i];
state[15] ^= (u8)tmp_mlen; // Padding
}
// Process the last partial block
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate the tag -----------------
G(state,state);
memcpy(c + mlen, state, TAGBYTES);
// ----------------- Generate the tag -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen > 0) {
SET_DOMAIN(tks, 0x24);
while (mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
}
return 0;
}
//Decryption and tag verification using Romulus-N1
//NIST LWC AEAD interface: decrypts clen-TAGBYTES bytes from c into m, then
//recomputes the tag over the AD and the recovered plaintext and compares it
//in constant time with the received tag. Returns 0 on success, non-zero on
//failure (and -1 when c is shorter than a tag).
//NOTE(review): the plaintext is written to m BEFORE the tag is checked, so
//callers must discard m whenever the return value is non-zero.
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
u64 tmp_mlen;
u8 final_domain = 0x30;
u8* m_auth = m; // read pointer for the authentication pass (m advances during decryption)
const u8* c_tmp = c; // kept to locate the received tag at the end
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
memcpy(state, c + clen, TAGBYTES); // state <- received tag
tmp_mlen = clen;
if (tmp_mlen > 0) {
SET_DOMAIN(tks, 0x24);
precompute_rtk2_3(tks.rtk2_3, npub, k); // round tweakeys from nonce and key
while (tmp_mlen > BLOCKBYTES) {
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
RHO_INV(state, c, m); // recover one full plaintext block
UPDATE_CTR(tks.tk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
tmp_mlen -= BLOCKBYTES;
}
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
// Recover the last (possibly partial) plaintext block byte-wise
for(int i = 0; i < (int)tmp_mlen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)tmp_mlen; // Padding
}
// ----------------- Process the ciphertext -----------------
// ----------------- Process the associated data -----------------
memset(tks.tk1, 0x00, KEYBYTES);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
memset(state, 0x00, BLOCKBYTES);
final_domain ^= final_ad_domain(adlen, clen);
SET_DOMAIN(tks, 0x28);
while (adlen > 2*BLOCKBYTES) { // Process double blocks but the last
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k); // 2nd AD block is the tweak material
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
} else {
SET_DOMAIN(tks, 0x2C);
UPDATE_CTR(tks.tk1);
if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
} else { // Left-over partial single block
for(int i =0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
}
// The 1st recovered message block provides the tweak material for this call
if (clen >= BLOCKBYTES) {
precompute_rtk2_3(tks.rtk2_3, m_auth, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
if (clen > BLOCKBYTES)
UPDATE_CTR(tks.tk1);
m_auth += BLOCKBYTES;
clen -= BLOCKBYTES;
} else {
memcpy(pad, m_auth, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
clen = 0;
}
}
// Process all message double blocks except the last
SET_DOMAIN(tks, 0x2C);
while (clen > 32) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
UPDATE_CTR(tks.tk1);
m_auth += 2 * BLOCKBYTES;
clen -= 2 * BLOCKBYTES;
}
// Process the last message double block
if (clen == 2 * BLOCKBYTES) { // Last message double block is full
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
precompute_rtk2_3(tks.rtk2_3, m_auth + BLOCKBYTES, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen > BLOCKBYTES) { // Last message double block is partial
clen -= BLOCKBYTES;
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, m_auth);
memcpy(pad, m_auth + BLOCKBYTES, clen);
memset(pad + clen, 0x00, BLOCKBYTES - clen - 1);
pad[15] = (u8)clen; // Padding
precompute_rtk2_3(tks.rtk2_3, pad, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
} else if (clen == BLOCKBYTES) { // Last message single block is full
XOR_BLOCK(state, state, m_auth);
} else if (clen > 0) { // Last message single block is partial
// NOTE(review): m and m_auth appear to point at the same byte here (both
// were advanced by the same multiple of BLOCKBYTES); the encrypt side uses
// m_auth in the matching branch -- confirm before relying on either name.
for(int i =0; i < (int)clen; i++)
state[i] ^= m[i];
state[15] ^= (u8)clen; // Padding
}
// Final call of the authentication pass, tweaked with the nonce
SET_DOMAIN(tks, final_domain);
UPDATE_CTR(tks.tk1);
precompute_rtk2_3(tks.rtk2_3, npub, k);
precompute_rtk1(tks.rtk1, tks.tk1);
skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
// ----------------- Process the associated data -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c_tmp[*mlen+i]; //constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
//Bugfix: u64 was previously 'unsigned int' (32-bit on the targeted platforms),
//which silently truncated the 64-bit lengths ('unsigned long long') assigned
//to u64 variables in crypto_aead_encrypt/decrypt for inputs >= 4GiB.
typedef unsigned long long u64;
//All tweakey material for SKINNY-128-384: tk1 is kept byte-wise since it
//holds the LFSR block counter and the domain byte, while rtk1/rtk2_3 cache
//precomputed round tweakeys to avoid tweakey-schedule recomputations.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Write the domain-separation byte into the 8th byte of TK1
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
//(GNU statement expression; relies on a 'u32 tmp' being in scope)
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 0, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_0(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(ROR(w, 24) & 0x0c0c0c0c, 30);
        w ^= ROR(ROR(w, 16) & 0xc0c0c0c0, 4);
        w ^= ROR(ROR(w, 8) & 0x0c0c0c0c, 2);
        state[j] = w;
    }
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 1, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_1(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(ROR(w, 16) & 0x30303030, 30);
        w ^= ROR(w & 0x03030303, 28);
        w ^= ROR(ROR(w, 16) & 0x30303030, 2);
        state[j] = w;
    }
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 2, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_2(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(ROR(w, 8) & 0xc0c0c0c0, 6);
        w ^= ROR(ROR(w, 16) & 0x0c0c0c0c, 28);
        w ^= ROR(ROR(w, 24) & 0xc0c0c0c0, 2);
        state[j] = w;
    }
}
/******************************************************************************
* MixColumns for rounds i such that (i % 4) == 3, applied in place on the
* four words of the bitsliced state.
******************************************************************************/
void mixcolumns_3(u32* state) {
    for (int j = 0; j < 4; j++) {
        u32 w = state[j];
        w ^= ROR(w & 0x03030303, 30);
        w ^= ROR(w & 0x30303030, 4);
        w ^= ROR(w & 0x03030303, 26);
        state[j] = w;
    }
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remain the same through the entire data encryption/decryption.
******************************************************************************/
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3) {
    u32 tmp;            // scratch word required by the SWAPMOVE macro
    u32 state[4];       // 128-bit bitsliced state
    packing(state, ptext);          // from byte to bitsliced representation
    // 56 rounds = 14 quadruple rounds; rtk2_3 advances 16 words per quadruple
    // round while rtk1 only stores 16 rounds worth of material and therefore
    // repeats with period 4 (offsets 0, 16, 32, 48).
    for (int q = 0; q < 14; q++)
        QUADRUPLE_ROUND(state, rtk1 + 16 * (q % 4), rtk2_3 + 16 * q);
    unpacking(ctext, state);        // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
//Fixed-width aliases (kept in sync with romulus.h / tk_schedule.c)
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds of SKINNY-128-384
#define SKINNY128_384_ROUNDS 56
/******************************************************************************
* Four consecutive SKINNY-128-384 rounds on the bitsliced 4-word state.
* Each round applies the bitsliced Sbox (the OR/NOR + SWAPMOVE sequences),
* XORs in 4 words of rtk1 and 4 words of rtk2_3, then runs the
* mixcolumns_{0..3} variant matching the round index mod 4.
* Implemented as a GNU statement expression; relies on a 'u32 tmp' being in
* scope for SWAPMOVE, and evaluates its pointer arguments multiple times (so
* pass side-effect-free expressions only).
******************************************************************************/
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
//Single-block SKINNY-128-384 encryption (defined in skinny128.c)
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#endif // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new (fixsliced) representation:
* 224 = 4 * SKINNY128_384_ROUNDS 32-bit words, i.e. four words per round for
* the 56 rounds of SKINNY-128-384.
******************************************************************************/
u32 rconst_32_bs[224] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff, 0x00000014, 0xffffffef,
0x00000000, 0x00000040, 0x00000100, 0x00000000, 0x10000040, 0xfffffeff,
0x44000000, 0xfffffeff, 0x00000000, 0x00000000, 0x00000000, 0x00100000,
0x04000001, 0xffffffff, 0x00040000, 0xffffffff, 0x00400000, 0x00000000,
0x00000000, 0x00000000, 0x00001000, 0xfebfffff, 0x01004400, 0xffffffff,
0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00010000, 0xffffffff,
0x00000004, 0xffffffbf, 0x00000040, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffebf, 0x44000100, 0xffffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00000001, 0xffffffff, 0x00040000, 0xffafffff,
0x00400000, 0x00000000, 0x00000000, 0x00000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfffffbff, 0x00000000, 0x00000400, 0x00000010, 0x00000000,
0x00010010, 0xffffffff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
* Loads the 16 input bytes little-endian (in byte order 0, 8, 4, 12) and
* reorders the bits with a fixed SWAPMOVE network.
******************************************************************************/
void packing(u32* out, const u8* in) {
u32 tmp; // scratch word required by the SWAPMOVE macro
LE_LOAD(out, in);
LE_LOAD(out + 1, in + 8);
LE_LOAD(out + 2, in + 4);
LE_LOAD(out + 3, in + 12);
SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[0], 0x30303030, 2);
SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
SWAPMOVE(out[3], out[0], 0x03030303, 6);
SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
SWAPMOVE(out[3], out[2], 0x03030303, 4);
SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation: exact inverse of packing
* (the same SWAPMOVE steps in reverse order, then little-endian stores in
* byte order 0, 8, 4, 12).
******************************************************************************/
void unpacking(u8* out, u32 *in) {
u32 tmp; // scratch word required by the SWAPMOVE macro
SWAPMOVE(in[3], in[1], 0x03030303, 2);
SWAPMOVE(in[3], in[2], 0x03030303, 4);
SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
SWAPMOVE(in[3], in[0], 0x03030303, 6);
SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
SWAPMOVE(in[2], in[0], 0x30303030, 2);
SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
LE_STORE(out, in[0]);
LE_STORE(out + 8, in[1]);
LE_STORE(out + 4, in[2]);
LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* Bitsliced LFSR2 (TK2 update). The slices rotate upwards and the new top
* slice is derived from slices 0 and 2:
* 0 4        1 5
* 1 5  --->  2 6
* 2 6        3 7
* 3 7        4 0
******************************************************************************/
void lfsr2_bs(u32* tk) {
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);  // feedback from slices 0 and 2
    // swap the two bits within every bit pair of the feedback word
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    tk[0] = tk[1];
    tk[1] = tk[2];
    tk[2] = tk[3];
    tk[3] = fb;
}
/******************************************************************************
* Bitsliced LFSR3 (TK3 update). The slices rotate downwards and the new
* bottom slice is derived from slices 3 and 1:
* 0 4        7 3
* 1 5  --->  0 4
* 2 6        1 5
* 3 7        2 6
******************************************************************************/
void lfsr3_bs(u32* tk) {
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);   // feedback from slices 3 and 1
    // swap the two bits within every bit pair of the feedback word
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    tk[3] = tk[2];
    tk[2] = tk[1];
    tk[1] = tk[0];
    tk[0] = fb;
}
/******************************************************************************
* Apply the tweakey permutation in a bitsliced manner, twice (P^2), in place
* on the four words of the bitsliced tweakey.
******************************************************************************/
void permute_tk_2(u32* tk) {
    for (int j = 0; j < 4; j++) {
        u32 w = tk[j];
        u32 r = ROR(w, 14) & 0xcc00cc00;
        r |= (w & 0x000000ff) << 16;
        r |= (w & 0xcc000000) >> 2;
        r |= (w & 0x0033cc00) >> 8;
        r |= (w & 0x00cc0000) >> 18;
        tk[j] = r;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
void permute_tk_4(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^4.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,22) & 0xcc0000cc;
		tk[i] |= ROR(tmp,16) & 0x3300cc00;
		tk[i] |= ROR(tmp, 24) & 0x00cc3300;
		tk[i] |= (tmp & 0x00cc00cc) >> 2;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
void permute_tk_6(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^6.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,6) & 0xcccc0000;
		tk[i] |= ROR(tmp,24) & 0x330000cc;
		tk[i] |= ROR(tmp,10) & 0x3333;
		tk[i] |= (tmp & 0xcc) << 14;
		tk[i] |= (tmp & 0x3300) << 2;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
void permute_tk_8(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^8.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,24) & 0xcc000033;
		tk[i] |= ROR(tmp,8) & 0x33cc0000;
		tk[i] |= ROR(tmp,26) & 0x00333300;
		tk[i] |= (tmp & 0x00333300) >> 6;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
void permute_tk_10(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^10.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,8) & 0xcc330000;
		tk[i] |= ROR(tmp,26) & 0x33000033;
		tk[i] |= ROR(tmp,22) & 0x00cccc00;
		tk[i] |= (tmp & 0x00330000) >> 14;
		tk[i] |= (tmp & 0xcc00) >> 2;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
void permute_tk_12(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^12.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,8) & 0xcc33;
		tk[i] |= ROR(tmp,30) & 0x00cc00cc;
		tk[i] |= ROR(tmp,10) & 0x33330000;
		tk[i] |= ROR(tmp,16) & 0xcc003300;
	}
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
void permute_tk_14(u32* tk) {
	u32 tmp;
	//Same structure as permute_tk_2 but with the masks/rotations of P^14.
	for(int i =0; i < 4; i++) {
		tmp = tk[i];
		tk[i] = ROR(tmp,24) & 0x0033cc00;
		tk[i] |= ROR(tmp,14) & 0x00cc0000;
		tk[i] |= ROR(tmp,30) & 0xcc000000;
		tk[i] |= ROR(tmp,16) & 0x000000ff;
		tk[i] |= ROR(tmp,18) & 0x33003300;
	}
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
//Precompute all LFSR2 states of TK2: pack the key, then store the bitsliced
//state before each pair of rounds. Only word offsets 0-3 and 8k+4..8k+7 are
//written (offsets 8k+8..8k+11 are left untouched, as the callers rely on).
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
	u32 state[4];
	packing(state, key);
	memcpy(tk, state, 16);
	//one LFSR step covers two rounds, hence the stride of 8 words
	for (int k = 0; 2 * k < rounds; k++) {
		lfsr2_bs(state);
		memcpy(tk + 8 * k + 4, state, 16);
	}
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
//Precompute all LFSR3 states of TK3 and XOR them on top of the TK2 material
//already stored by precompute_lfsr_tk2 (same offsets: 0-3 and 8k+4..8k+7).
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
	u32 state[4];
	packing(state, key);
	for (int j = 0; j < 4; j++)
		tk[j] ^= state[j];
	//one LFSR step covers two rounds, hence the stride of 8 words
	for (int k = 0; 2 * k < rounds; k++) {
		lfsr3_bs(state);
		for (int j = 0; j < 4; j++)
			tk[8 * k + 4 + j] ^= state[j];
	}
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
	u32 test; //boolean: selects which power of P applies to this 8-round group
	u32 tk1[4], tmp[4];
	packing(tk1, key);
	memcpy(tmp, tk, 16);
	//XOR TK1 into the first tweakey block
	tmp[0] ^= tk1[0];
	tmp[1] ^= tk1[1];
	tmp[2] ^= tk1[2];
	tmp[3] ^= tk1[3];
	//Process 8 rounds per iteration. The tweakey permutation P has order 16,
	//so even-numbered groups use P^2/P^4/P^6/P^8 and odd-numbered groups use
	//P^10/P^12/P^14 (P^16 = identity).
	for(int i = 0 ; i < rounds; i += 8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		//rounds i, i+1: rearrange into the barrel-shiftrows representation
		tk[i*4] = tmp[2] & 0xf0f0f0f0;
		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+4, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		//rounds i+2, i+3
		tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
		tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
		tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
		tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+12, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		//rounds i+4, i+5
		for(int j = 0; j < 4; j++) {
			tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
			tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
		}
		tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
		tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
		tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
		tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
		memcpy(tmp, tk+i*4+20, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		//rounds i+6, i+7
		tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
		tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
		tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
		tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
		tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
		tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
		tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
		tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*4+28, 16);
		XOR_BLOCKS(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8 (else: P^16 = identity, nothing to do)
		for(int j = 0; j < 4; j++) {
			tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
			tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
		}
		if (test && (i+8 < rounds)) { //only if next loop iteration
			//pre-store the first words of the next group (consumed above)
			tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
			tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
			tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
			tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
		}
	}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); //16 bytes of tweakey per round
	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
	//The LFSR precomputations only write words at offsets 0-3 and 8k+4..8k+7,
	//so the 16 bytes at rtk+8 are still zero from the memset above: they are
	//passed to permute_tk as an all-zero TK1.
	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
		for(int j = 0; j < 4; j++)
			rtk[i*4+j] ^= rconst_32_bs[i*4+j];
	}
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
//Precompute the TK1 round tweakeys. Since P has order 16, 16 rounds of
//material suffice; the schedule repeats from there.
void precompute_rtk1(u32* rtk1, const u8* tk1) {
	memset(rtk1, 0, 16 * 4 * sizeof(u32)); //16 rounds * 4 words
	permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Bitslice a 16-byte block into 4 32-bit words / undo that representation.
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
//Precompute the round tweakeys derived from TK1 (16 rounds; pattern repeats).
void precompute_rtk1(u32* rtk1, const u8* tk1);
//Rotate the 32-bit word x right by y bits (y must be in 1..31).
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
//x <- x ^ y for 128-bit blocks given as u32[4] (GNU statement expression).
#define XOR_BLOCKS(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
})
//Swap the bits of a and b selected by mask after shifting a right by n.
//Requires a u32 variable named 'tmp' in the calling scope.
//(Arguments are fully parenthesized to avoid precedence surprises.)
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = ((b) ^ ((a) >> (n))) & (mask); \
	(b) ^= tmp; \
	(a) ^= tmp << (n); \
})
//Load 4 little-endian bytes from y into the u32 pointed to by x.
#define LE_LOAD(x, y) \
	*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0]);
//Store the u32 y as 4 little-endian bytes at x.
//Wrapped in do/while(0) so the multi-statement macro behaves as a single
//statement (safe inside an unbraced if/else or loop body).
#define LE_STORE(x, y) do { \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24; \
} while (0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#define CRYPTO_KEYBYTES 16 //128-bit secret key
#define CRYPTO_NSECBYTES 0 //no secret message number
#define CRYPTO_NPUBBYTES 16 //128-bit public nonce
#define CRYPTO_ABYTES 16 //128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
//Writes mlen+CRYPTO_ABYTES bytes to c and sets *clen accordingly; returns 0.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec, const unsigned char *npub,
	const unsigned char *k);
//API required by the NIST for the LWC competition
//Returns 0 and sets *outputmlen on success; nonzero on tag mismatch.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
//Encryption and authentication using Romulus-N1
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k) {
	u32 tmp; //scratch word required by the G/UPDATE_CTR/RHO macros
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; //pad doubles as RHO scratch
	(void)nsec; //unused: CRYPTO_NSECBYTES == 0
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	if (adlen == 0) { // Handle the special case of no AD
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //TK2 <- nonce, TK3 <- key
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process double blocks but the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad); //1st AD block absorbed into the state
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS); //2nd AD block absorbed as TK2
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1); //counter advances once per single block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		// Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
			XOR_BLOCK(state, state, ad);
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) { // Left-over partial double block
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			//10*-style padding: zero-fill, last byte records the length
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else { // Left-over partial single block
			for(int i = 0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
			SET_DOMAIN(tks, 0x1A);
		}
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //final AD call uses the nonce as TK2
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the plaintext -----------------
	memset(tks.tk1, 0x00, KEYBYTES/2); //reset counter+domain; tks.rtk is reused as-is
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	if (mlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (mlen > BLOCKBYTES) {
			RHO(state,c,m); //updates state and emits one ciphertext block
			UPDATE_CTR(tks.tk1);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			mlen -= BLOCKBYTES;
		}
		// Pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (mlen < BLOCKBYTES) { // Last message single block is partial
			//byte-wise RHO restricted to the mlen valid bytes
			for(int i = 0; i < (int)mlen; i++) {
				tmp = m[i]; // Use of tmp variable in case c = m
				c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7); //G applied byte-wise
				state[i] ^= (u8)tmp;
			}
			state[15] ^= (u8)mlen; // Padding
			SET_DOMAIN(tks, 0x15);
		} else { // Last message single block is full
			RHO(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
		c += mlen; //c now points at the tag location
	}
	// ----------------- Process the plaintext -----------------
	// ----------------- Generate the tag -----------------
	G(c,state); //tag = G(state), appended after the ciphertext
	// ----------------- Generate the tag -----------------
	return 0;
}
//Decryption and tag verification using Romulus-N1
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k) {
	u32 tmp; //scratch word required by the G/UPDATE_CTR/RHO_INV macros
	skinny_128_384_tks tks;
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; //pad doubles as RHO_INV scratch
	(void)nsec; //unused: CRYPTO_NSECBYTES == 0
	if (clen < TAGBYTES)
		return -1; //ciphertext cannot even hold the tag
	// ----------------- Initialization -----------------
	*mlen = clen - TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	//AD processing is identical to crypto_aead_encrypt
	if (adlen == 0) { // Handle the special case of no AD
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //TK2 <- nonce, TK3 <- key
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process double blocks except the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad); //1st AD block absorbed into the state
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS); //2nd AD block absorbed as TK2
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1); //counter advances once per single block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		// Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
			XOR_BLOCK(state, state, ad);
			tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) { // Left-over partial double block
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			//10*-style padding: zero-fill, last byte records the length
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;
			tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
			tkschedule_perm(tks.rtk);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) { // Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else { // Left-over partial single block
			for(int i = 0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen; // Padding
			SET_DOMAIN(tks, 0x1A);
		}
		tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS); //final AD call uses the nonce as TK2
		tkschedule_perm(tks.rtk);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the ciphertext -----------------
	clen -= TAGBYTES; //clen now counts ciphertext bytes only
	memset(tks.tk1, 0x00, KEYBYTES/2); //reset counter+domain; tks.rtk is reused as-is
	tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
	if (clen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	} else { // Process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (clen > BLOCKBYTES) {
			RHO_INV(state,c,m); //updates state and recovers one plaintext block
			UPDATE_CTR(tks.tk1);
			tkschedule_perm_tk1(tks.rtk1, tks.tk1);
			skinny128_384(state, tks.rtk, state, tks.rtk1);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			clen -= BLOCKBYTES;
		}
		// Pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (clen < BLOCKBYTES) { // Last ciphertext single block is partial
			//byte-wise RHO_INV restricted to the clen valid bytes
			for(int i = 0; i < (int)clen; i++) {
				m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7); //G applied byte-wise
				state[i] ^= m[i];
			}
			state[15] ^= (u8)clen; // Padding
			SET_DOMAIN(tks, 0x15);
		} else { // Last ciphertext single block is full
			RHO_INV(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		tkschedule_perm_tk1(tks.rtk1, tks.tk1);
		skinny128_384(state, tks.rtk, state, tks.rtk1);
	}
	// ----------------- Process the ciphertext -----------------
	// ----------------- Generate and check the tag -----------------
	//NOTE(review): the plaintext has already been written to m at this point;
	//callers must discard it when the return value is nonzero.
	G(state,state);
	tmp = 0;
	for(int i = 0; i < TAGBYTES; i++)
		tmp |= state[i] ^ c[clen+i];	// Constant-time tag comparison
	// ----------------- Generate and check the tag -----------------
	return tmp; //0 iff the recomputed tag matches
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
//Tweakey material for SKINNY-128-384 as used by Romulus-N1
typedef struct {
	u8 tk1[16]; //to manipulate tk1 in a byte-wise manner
	u32 rtk1[32]; //to avoid recomputation of the tk schedule
	u32 rtk[4*SKINNY128_384_ROUNDS];//all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//write the domain-separation byte into the 8th byte of TK1
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
//Requires a u32 variable named 'tmp' in the calling scope.
//NOTE(review): the u32 casts assume x and y are 4-byte aligned and rely on
//type punning — confirm for all call sites / target ABI.
#define G(x,y) ({ \
	tmp = ((u32*)(y))[0]; \
	((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[1]; \
	((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[2]; \
	((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[3]; \
	((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner
//Advances the 56-bit LFSR over tk1[0..6]; byte 7 (the domain) is preserved
//via the 0xff000000 mask. When bit 55 shifts out, the feedback constant 0x95
//is XORed into the low byte. Requires a u32 'tmp' in scope.
#define UPDATE_CTR(tk1) ({ \
	tmp = ((u32*)(tk1))[1]; \
	((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
	((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
	((u32*)(tk1))[1] |= tmp & 0xff000000; \
	((u32*)(tk1))[0] <<= 1; \
	if ((tmp >> 23) & 0x01) \
		((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
	((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
	((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
	((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
	((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(y, pad, z); \
	XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
	G(pad, x); \
	XOR_BLOCK(z, pad, y); \
	XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds of SKINNY-128-384
#define SKINNY128_384_ROUNDS 40
//Routines implemented in ARM assembly (skinny128.s)
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1); //encrypt ptext into ctext under the precomputed round tweakeys
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds); //compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds
extern void tkschedule_perm(u32* rtk); //apply the tweakey permutations to the precomputed rtk material
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1); //round tweakeys derived from TK1
#endif  // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p2:
	//applies P^2 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r1, #0xcc00
	movt r1, #0xcc00 //r1 <- 0xcc00cc00
	movw r10, #0xcc00
	movt r10, #0x0033 //r10<- 0x0033cc00
	and r11, r1, r6, ror #14 // --- permute r6 twice
	bfi r11, r6, #16, #8
	and r12, r6, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r6
	orr r11, r11, r12, lsr #8
	and r12, r6, #0x00cc0000
	orr r6, r11, r12, lsr #18 // permute r6 twice ---
	and r11, r1, r7, ror #14 // --- permute r7 twice
	bfi r11, r7, #16, #8
	and r12, r7, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r7
	orr r11, r11, r12, lsr #8
	and r12, r7, #0x00cc0000
	orr r7, r11, r12, lsr #18 // permute r7 twice ---
	and r11, r1, r8, ror #14 // --- permute r8 twice
	bfi r11, r8, #16, #8
	and r12, r8, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r8
	orr r11, r11, r12, lsr #8
	and r12, r8, #0x00cc0000
	orr r8, r11, r12, lsr #18 // permute r8 twice ---
	and r11, r1, r9, ror #14 // --- permute r9 twice
	bfi r11, r9, #16, #8
	and r12, r9, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r9
	orr r11, r11, r12, lsr #8
	and r12, r9, #0x00cc0000
	orr r9, r11, r12, lsr #18 // permute r9 twice ---
	bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p4:
	//applies P^4 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	//NOTE(review): spills r14 to [sp] without adjusting sp — assumes the
	//caller reserved that slot; confirm at call sites.
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0x00cc
	movt r14, #0xcc00 //r14<- 0xcc0000cc
	movw r12, #0xcc00
	movt r12, #0x3300 //r12<- 0x3300cc00
	movw r11, #0x00cc
	movt r11, #0x00cc //r11<- 0x00cc00cc
	and r10, r14, r6, ror #22 // --- permute r6 4 times
	and r1, r12, r6, ror #16
	orr r10, r10, r1
	and r1, r6, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r6, r6, r1
	orr r6, r10, r6, ror #24 // permute r6 4 times ---
	and r10, r14, r7, ror #22 // --- permute r7 4 times
	and r1, r12, r7, ror #16
	orr r10, r10, r1
	and r1, r7, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r7, r7, r1
	orr r7, r10, r7, ror #24 // permute r7 4 times ---
	and r10, r14, r8, ror #22 // --- permute r8 4 times
	and r1, r12, r8, ror #16
	orr r10, r10, r1
	and r1, r8, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r8, r8, r1
	orr r8, r10, r8, ror #24 // permute r8 4 times ---
	and r10, r14, r9, ror #22 // --- permute r9 4 times
	ldr.w r14, [sp] //restore r14
	and r12, r12, r9, ror #16
	orr r10, r10, r12
	and r12, r9, r11
	orr r10, r10, r12, lsr #2
	movw r12, #0xcc33 //r12<- 0x0000cc33
	and r9, r9, r12
	orr r9, r10, r9, ror #24 // permute r9 4 times ---
	bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p6:
	//applies P^6 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r1, #0x3333 //r1 <- 0x00003333
	movw r12, #0x00cc
	movt r12, #0x3300 //r12<- 0x330000cc
	and r10, r6, r1, ror #8 // --- permute r6 6 times
	and r11, r12, r6, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r6, ror #10
	orr r11, r11, r10
	and r10, r6, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r6, #0x00003300
	orr r6, r11, r10, lsl #2 // permute r6 6 times ---
	and r10, r7, r1, ror #8 // --- permute r7 6 times
	and r11, r12, r7, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r7, ror #10
	orr r11, r11, r10
	and r10, r7, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r7, #0x00003300
	orr r7, r11, r10, lsl #2 // permute r7 6 times ---
	and r10, r8, r1, ror #8 // --- permute r8 6 times
	and r11, r12, r8, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r8, ror #10
	orr r11, r11, r10
	and r10, r8, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r8, #0x00003300
	orr r8, r11, r10, lsl #2 // permute r8 6 times ---
	and r10, r9, r1, ror #8 // --- permute r9 6 times
	and r11, r12, r9, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r9, ror #10
	orr r11, r11, r10
	and r10, r9, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r9, #0x00003300
	orr r9, r11, r10, lsl #2 // permute r9 6 times ---
	bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p8:
	//applies P^8 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r12, #0x3333 //r12<- 0x00003333
	movw r1, #0x0000
	movt r1, #0x33cc //r1 <- 0x33cc0000
	and r10, r6, r1 // --- permute r6 8 times
	and r11, r1, r6, ror #8
	orr r11, r11, r10, ror #24
	and r10, r6, r12, lsl #2 //mask 0x0000cccc
	orr r11, r11, r10, ror #26
	and r10, r6, r12, lsl #8 //mask 0x00333300
	orr r6, r11, r10, lsr #6 // permute r6 8 times ---
	and r10, r7, r1 // --- permute r7 8 times
	and r11, r1, r7, ror #8
	orr r11, r11, r10, ror #24
	and r10, r7, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r7, r12, lsl #8
	orr r7, r11, r10, lsr #6 // permute r7 8 times ---
	and r10, r8, r1 // --- permute r8 8 times
	and r11, r1, r8, ror #8
	orr r11, r11, r10, ror #24
	and r10, r8, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r8, r12, lsl #8
	orr r8, r11, r10, lsr #6 // permute r8 8 times ---
	and r10, r9, r1 // --- permute r9 8 times
	and r11, r1, r9, ror #8
	orr r11, r11, r10, ror #24
	and r10, r9, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r9, r12, lsl #8
	orr r9, r11, r10, lsr #6 // permute r9 8 times ---
	bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p10:
	//applies P^10 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r12, #0x0033
	movt r12, #0x3300 //r12<- 0x33000033
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r10, r6, r1, ror #8 // --- permute r6 10 times
	and r11, r12, r6, ror #26
	orr r11, r11, r10, ror #8
	and r10, r6, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r6, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r6, #0x0000cc00
	orr r6, r11, r10, lsr #2 // permute r6 10 times ---
	and r10, r7, r1, ror #8 // --- permute r7 10 times
	and r11, r12, r7, ror #26
	orr r11, r11, r10, ror #8
	and r10, r7, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r7, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r7, #0x0000cc00
	orr r7, r11, r10, lsr #2 // permute r7 10 times ---
	and r10, r8, r1, ror #8 // --- permute r8 10 times
	and r11, r12, r8, ror #26
	orr r11, r11, r10, ror #8
	and r10, r8, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r8, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r8, #0x0000cc00
	orr r8, r11, r10, lsr #2 // permute r8 10 times ---
	and r10, r9, r1, ror #8 // --- permute r9 10 times
	and r11, r12, r9, ror #26
	orr r11, r11, r10, ror #8
	and r10, r9, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r9, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r9, #0x0000cc00
	orr r9, r11, r10, lsr #2 // permute r9 10 times ---
	bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p12:
	//applies P^12 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	//NOTE(review): spills r14 to [sp] without adjusting sp — assumes the
	//caller reserved that slot; confirm at call sites.
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0xcc33 //r14<- 0x0000cc33
	movw r12, #0x00cc
	movt r12, #0x00cc //r12<- 0x00cc00cc
	movw r1, #0x3300
	movt r1, #0xcc00 //r1 <- 0xcc003300
	and r10, r14, r6, ror #8 // --- permute r6 12 times
	and r11, r12, r6, ror #30
	orr r11, r11, r10
	and r10, r1, r6, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r6, r10, ror #8
	orr r6, r11, r10, ror #10 // permute r6 12 times ---
	and r10, r14, r7, ror #8 // --- permute r7 12 times
	and r11, r12, r7, ror #30
	orr r11, r11, r10
	and r10, r1, r7, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r7, r10, ror #8
	orr r7, r11, r10, ror #10 // permute r7 12 times ---
	and r10, r14, r8, ror #8 // --- permute r8 12 times
	and r11, r12, r8, ror #30
	orr r11, r11, r10
	and r10, r1, r8, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r8, r10, ror #8
	orr r8, r11, r10, ror #10 // permute r8 12 times ---
	and r10, r14, r9, ror #8 // --- permute r9 12 times
	and r11, r12, r9, ror #30
	orr r11, r11, r10
	and r10, r1, r9, ror #16
	ldr.w r14, [sp] //restore r14 (r14 no longer needed as a mask)
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r9, r10, ror #8
	orr r9, r11, r10, ror #10 // permute r9 12 times ---
	bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
p14:
	//applies P^14 to the bitsliced tweakey words r6-r9; clobbers r1, r10-r12
	movw r1, #0xcc00
	movt r1, #0x0033 //r1 <- 0x0033cc00
	movw r12, #0xcc00
	movt r12, #0xcc00 //r12<- 0xcc00cc00 (= 0x33003300 rotated for the ror #18 below)
	and r10, r1, r6, ror #24 // --- permute r6 14 times
	and r11, r6, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r6, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r6, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r6, r12
	orr r6, r11, r10, ror #18 // permute r6 14 times ---
	and r10, r1, r7, ror #24 // --- permute r7 14 times
	and r11, r7, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r7, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r7, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r7, r12
	orr r7, r11, r10, ror #18 // permute r7 14 times ---
	and r10, r1, r8, ror #24 // --- permute r8 14 times
	and r11, r8, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r8, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r8, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r8, r12
	orr r8, r11, r10, ror #18 // permute r8 14 times ---
	and r10, r1, r9, ror #24 // --- permute r9 14 times
	and r11, r9, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r9, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r9, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r9, r12
	orr r9, r11, r10, ror #18 // permute r9 14 times ---
	bx lr
.align 2
packing:
	//Bitslices the 128-bit block held in r2-r5 (in place).
	//Expects masks r10 = 0x0a0a0a0a and r11 = 0x30303030 pre-loaded by the
	//caller (see tkschedule_lfsr); clobbers r12. Derived masks are obtained
	//by shifting r11.
	eor r12, r2, r2, lsr #3
	and r12, r12, r10
	eor r2, r2, r12
	eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
	eor r12, r3, r3, lsr #3
	and r12, r12, r10
	eor r3, r3, r12
	eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
	eor r12, r4, r4, lsr #3
	and r12, r12, r10
	eor r4, r4, r12
	eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
	eor r12, r5, r5, lsr #3
	and r12, r12, r10
	eor r5, r5, r12
	eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
	eor r12, r2, r4, lsr #2
	and r12, r12, r11
	eor r2, r2, r12
	eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
	eor r12, r2, r3, lsr #4
	and r12, r12, r11, lsr #2 //mask 0x0c0c0c0c
	eor r2, r2, r12
	eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
	eor r12, r2, r5, lsr #6
	and r12, r12, r11, lsr #4 //mask 0x03030303
	eor r2, r2, r12
	eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
	eor r12, r4, r3, lsr #2
	and r12, r12, r11, lsr #2
	eor r4, r4, r12
	eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
	eor r12, r4, r5, lsr #4
	and r12, r12, r11, lsr #4
	eor r4, r4, r12
	eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
	eor r12, r3, r5, lsr #2
	and r12, r12, r11, lsr #4
	eor r3, r3, r12
	eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
	bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
* The two tweakeys are first packed into the bitsliced representation; the
* loop then advances LFSR2 (on the packed TK2 slices) and LFSR3 (on the
* packed TK3 slices) and stores the XOR of the two into the tk array.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
    push {r0-r12, r14}
    ldr.w r3, [r1, #8] //load tk2 (3rd word)
    ldr.w r4, [r1, #4] //load tk2 (2nd word)
    ldr.w r5, [r1, #12] //load tk2 (4th word)
    ldr.w r12, [r1] //load tk2 (1st word)
    mov r1, r2 //move tk3 address in r1
    mov r2, r12 //move 1st tk2 word in r2
    movw r10, #0x0a0a
    movt r10, #0x0a0a //r10<- 0x0a0a0a0a
    movw r11, #0x3030
    movt r11, #0x3030 //r11<- 0x30303030
    bl packing //pack tk2
    mov r6, r2 //move tk2 from r2-r5 to r6-r9
    mov r7, r3 //move tk2 from r2-r5 to r6-r9
    mov r8, r4 //move tk2 from r2-r5 to r6-r9
    mov r9, r5 //move tk2 from r2-r5 to r6-r9
    ldr.w r3, [r1, #8] //load tk3 (3rd word)
    ldr.w r4, [r1, #4] //load tk3 (2nd word)
    ldr.w r5, [r1, #12] //load tk3 (4th word)
    ldr.w r2, [r1] //load tk3 (1st word)
    bl packing //pack tk3
    eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa
    ldr.w r1, [sp, #12] //load loop counter in r1 (the 'rounds' arg r3 saved by push)
    eor r11, r2, r6 //tk2 ^ tk3 (1st word)
    eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
    eor r12, r5, r9 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #8 //store in tk
loop: //each iteration produces 8 rounds' worth of LFSR2(TK2)^LFSR3(TK3)
    and r12, r8, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r6
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r3, r10 // --- apply LFSR3 to tk3
    eor r12, r5, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r5, r7 //tk2 ^ tk3 (1st word)
    eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
    eor r12, r4, r6 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk; #24 skips slots interleaved with other round data
    and r12, r9, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r7
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r2, r10 // --- apply LFSR3 to tk3
    eor r12, r4, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r4, r8 //tk2 ^ tk3 (1st word)
    eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
    eor r12, r3, r7 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk
    and r12, r6, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r8
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r5, r10 // --- apply LFSR3 to tk3
    eor r12, r3, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r3, r9 //tk2 ^ tk3 (1st word)
    eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
    eor r12, r2, r8 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk
    and r12, r7, r10 // --- apply LFSR2 to tk2
    eor r12, r12, r9
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
    and r12, r4, r10 // --- apply LFSR3 to tk3
    eor r12, r2, r12, lsr #1
    and r14, r10, r12, lsl #1
    and r12, r12, r10
    orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
    eor r11, r2, r6 //tk2 ^ tk3 (1st word)
    eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
    strd r11, r12, [r0], #8 //store in tk
    eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
    eor r12, r5, r9 //tk2 ^ tk3 (4th word)
    strd r11, r12, [r0], #24 //store in tk
    subs.w r1, r1, #8 //decrease loop counter by 8
    bne loop
    pop {r0-r12, r14}
    bx lr
@ void tkschedule_perm(u32* tk)
@ Applies the tweakey permutation P^i (via the p2..p14 subroutines), the
@ fixslicing rotations/masks and the per-round constants (rconsts) to the
@ precomputed tk array, in place, for 40 rounds. The final mvn on the last
@ tweakey word of each round absorbs a NOT from the S-box computation.
@ NOTE(review): rounds are written out of sequential order (e.g. round 5
@ before round 4) because pairs of rounds share one permuted state — the
@ strd offsets (#16/#24) place each half at its final slot.
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
    push {r0-r12, lr}
    sub.w sp, #4 //to store r14 in subroutines
    ldm r0, {r6-r9} //load tk
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
    and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
    and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
    and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
    eor r8, r8, #0x00000004 //add rconst
    eor r9, r9, #0x00000040 //add rconst
    mvn r9, r9 //to remove a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 1st round
    strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
    ldm r0, {r6-r9} //load tk
    bl p2 //apply the permutation twice
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000100 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x44000000 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
    strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
    ldm r0, {r6-r9} //load tk
    bl p4 //apply the permutation 4 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    eor r12, r12, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 5th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r7, r7, #0x00100000 //add rconst
    eor r8, r8, #0x00100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 4th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
    ldm r0, {r6-r9} //load tk
    bl p6 //apply the permutation 6 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01000000 //add rconst
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 6th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    eor r12, r12, #0x00400000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x01000000 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x01000000 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 7th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
    ldm r0, {r6-r9} //load tk
    bl p8 //apply the permutation 8 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing (P^8 needs no ror)
    and r12, r10, r7 //mask to match fixslicing (P^8 needs no ror)
    eor r12, r12, #0x00000040 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000054 //add rconst
    eor r12, r12, #0x00000050 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 9th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6 ,r6, #0x00000010 //add rconst
    eor r8, r8, #0x00010000 //add rconst
    eor r8, r8, #0x00000410 //add rconst
    eor r9, r9, #0x00000410 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 8th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
    ldm r0, {r6-r9} //load tk
    bl p10 //apply the permutation 10 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x00000100 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 10th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000140 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r6, r6, #0x04000000 //add rconst
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x44000000 //add rconst
    eor r9, r9, #0x00000100 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 11th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
    ldm r0, {r6-r9} //load tk
    bl p12 //apply the permutation 12 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00140000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 13th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r7, r7, #0x00100000 //add rconst
    eor r8, r8, #0x04000000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 12th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
    ldm r0, {r6-r9} //load tk
    bl p14 //apply the permutation 14 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 14th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    eor r12, r12, #0x01400000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x01000000 //add rconst
    eor r8, r8, #0x00004400 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 15th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
    ldm r0, {r6-r9} //load tk (P^16 = Id, so no permutation call here)
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing
    and r12, r10, r7 //mask to match fixslicing
    eor r11, r11, #0x00000040 //add rconst
    eor r12, r12, #0x00000040 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000004 //add rconst
    eor r12, r12, #0x00000050 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 17th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6 ,r6, #0x00000010 //add rconst
    eor r7 ,r7, #0x00000010 //add rconst
    eor r8, r8, #0x00000010 //add rconst
    eor r8, r8, #0x00010000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 16th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
    ldm r0, {r6-r9} //load tk
    bl p2 //apply the permutation twice
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x00000100 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 18th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000140 //add rconst
    eor r12, r12, #0x00000040 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x40000000 //add rconst
    eor r8, r8, #0x00000100 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    eor r9, r9, #0x00000100 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 19th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
    ldm r0, {r6-r9} //load tk
    bl p4 //apply the permutation 4 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r12, r12, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 21st round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00100000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 21st round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r8, r8, #0x04100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 20th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
    ldm r0, {r6-r9} //load tk
    bl p6 //apply the permutation 6 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01000000 //add rconst
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 22nd round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 22nd round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x01000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 23rd round
    strd r6, r7, [r0], #8 //store 2nd half tk for 23rd round
    ldm r0, {r6-r9} //load tk
    bl p8 //apply the permutation 8 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing
    and r12, r10, r7 //mask to match fixslicing
    strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000014 //add rconst
    eor r12, r12, #0x00000040 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 25th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r8, r8, #0x00010400 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 24th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
    ldm r0, {r6-r9} //load tk
    bl p10 //apply the permutation 10 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 26th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x40000000 //add rconst
    eor r9, r9, #0x04000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 27th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
    ldm r0, {r6-r9} //load tk
    bl p12 //apply the permutation 12 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r12, r12, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 29th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r8, r8, #0x00100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 28th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
    ldm r0, {r6-r9} //load tk
    bl p14 //apply the permutation 14 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01000000 //add rconst
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 30th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x01000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 31st round
    strd r6, r7, [r0], #8 //store 2nd half tk for 31st round
    ldm r0, {r6-r9} //load tk (P^32 = Id, so no permutation call here)
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6 //mask to match fixslicing
    and r12, r10, r7 //mask to match fixslicing
    strd r11, r12, [r0, #24] //store 2nd half tk for 33rd round
    and r11, r10, r8 //mask to match fixslicing
    and r12, r10, r9 //mask to match fixslicing
    eor r11, r11, #0x00000014 //add rconst
    eor r12, r12, #0x00000050 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 33rd round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6 ,r6, #0x00000010 //add rconst
    eor r8, r8, #0x00010400 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 32nd round
    strd r8, r9, [r0], #24 //store 2nd half tk for 32nd round
    ldm r0, {r6-r9} //load tk
    bl p2 //apply the permutation twice
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #26 //ror and mask to match fixslicing
    and r12, r10, r7, ror #26 //ror and mask to match fixslicing
    strd r11, r12, [r0], #8 //store 1st half tk for 34th round
    and r11, r10, r8, ror #26 //ror and mask to match fixslicing
    and r12, r10, r9, ror #26 //ror and mask to match fixslicing
    eor r11, r11, #0x10000000 //add rconst
    eor r11, r11, #0x00000140 //add rconst
    eor r12, r12, #0x00000100 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #12
    and r11, r10, r7, ror #28
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #12
    and r11, r10, r8, ror #28
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #12
    and r11, r10, r9, ror #28
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
    eor r7, r7, #0x04000000 //add rconst
    eor r8, r8, #0x44000000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 35th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
    ldm r0, {r6-r9} //load tk
    bl p4 //apply the permutation 4 times
    movw r10, #0xf0f0
    movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
    and r11, r10, r6, ror #16 //ror and mask to match fixslicing
    and r12, r10, r7, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00400000 //add rconst
    strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
    and r11, r10, r8, ror #16 //ror and mask to match fixslicing
    and r12, r10, r9, ror #16 //ror and mask to match fixslicing
    eor r11, r11, #0x00440000 //add rconst
    eor r12, r12, #0x00500000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0, #16] //store 1st half tk for 37th round
    and r10, r10, r10, lsr #2 //r10<- 0x30303030
    and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #6
    and r11, r10, r7, ror #14
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #6
    and r11, r10, r8, ror #14
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #6
    and r11, r10, r9, ror #14
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00100000 //add rconst
    eor r7, r7, #0x00100000 //add rconst
    eor r8, r8, #0x00000001 //add rconst
    eor r9, r9, #0x00100000 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 36th round
    strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
    ldm r0, {r6-r9} //load tk
    bl p6 //apply the permutation 6 times
    movw r10, #0xc3c3
    movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
    and r11, r10, r6, ror #10 //ror and mask to match fixslicing
    and r12, r10, r7, ror #10 //ror and mask to match fixslicing
    eor r12, r12, #0x01000000 //add rconst
    strd r11, r12, [r0], #8 //store 1st half tk for 38th round
    and r11, r10, r8, ror #10 //ror and mask to match fixslicing
    and r12, r10, r9, ror #10 //ror and mask to match fixslicing
    eor r11, r11, #0x01400000 //add rconst
    eor r11, r11, #0x00001000 //add rconst
    eor r12, r12, #0x00400000 //add rconst
    mvn r12, r12 //to save a NOT in sbox calculations
    strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
    and r10, r10, r10, lsr #6 //r10<- 0x03030303
    and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
    and r6, r6, r10, lsl #6
    orr r6, r11, r6, ror #28
    and r11, r10, r7, ror #12
    and r7, r7, r10, lsl #6
    orr r7, r11, r7, ror #28
    and r11, r10, r8, ror #12
    and r8, r8, r10, lsl #6
    orr r8, r11, r8, ror #28
    and r11, r10, r9, ror #12
    and r9, r9, r10, lsl #6
    orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000400 //add rconst
    eor r7, r7, #0x00000400 //add rconst
    eor r8, r8, #0x01000000 //add rconst
    eor r8, r8, #0x00004000 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r8, r9, [r0], #8 //store 1st half tk for 39th round
    strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
    ldm r0, {r6-r9} //load tk
    bl p8 //apply the permutation 8 times
    movw r10, #0x3030
    movt r10, #0x3030 //r10<- 0x30303030
    and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
    and r6, r6, r10, ror #4
    orr r6, r11, r6, ror #22
    and r11, r10, r7, ror #30
    and r7, r7, r10, ror #4
    orr r7, r11, r7, ror #22
    and r11, r10, r8, ror #30
    and r8, r8, r10, ror #4
    orr r8, r11, r8, ror #22
    and r11, r10, r9, ror #30
    and r9, r9, r10, ror #4
    orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
    eor r6, r6, #0x00000010 //add rconst
    eor r8, r8, #0x00010000 //add rconst
    eor r8, r8, #0x00000010 //add rconst
    eor r9, r9, #0x00000400 //add rconst
    mvn r9, r9 //to save a NOT in sbox calculations
    strd r6, r7, [r0], #8 //store 1st half tk for 40th round
    strd r8, r9, [r0] //store 2nd half tk for 40th round (no writeback: last store)
    add.w sp, #4
    pop {r0-r12, lr}
    bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
*
* Computes the fixsliced round tweakeys derived from TK1 only: packs the
* 16-byte key, then for each pair of rounds applies the SKINNY tweakey
* permutation twice (bl p2), masks the relevant nibbles (0xf0f0f0f0 for even
* stages, 0x03030303-based rotations for odd stages) and stores 16 bytes of
* round tweakey material per stage into the output buffer (r0).
* Registers: r0 = u32* tk (output), r1 = const u8* key (input).
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (packing mask; original comment said r6)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (packing mask; original comment said r7)
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing --- (tk for 3rd round)
bl p2 //apply the permutation 4 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation 6 times
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing --- (tk for 7th round)
bl p2 //apply the permutation 8 times
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation 10 times
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing --- (tk for 11th round)
bl p2 //apply the permutation 12 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round (comment said 5th)
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round (comment said 5th)
bl p2 //apply the permutation 14 times
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing --- (tk for 15th round)
pop {r0-r12, lr}
bx lr
/******************************************************************************
* quadruple_round: applies 4 consecutive fixsliced SKINNY-128 rounds to the
* 128-bit bitsliced state held in r2-r5.
*   r0 = pointer to rtk1 (TK1 round tweakeys), advanced by ldmia
*   r1 = pointer to rtk2_3 (TK2/TK3 round tweakeys + rconsts), advanced
*   r6 = 0x55555555 (SWAPMOVE mask), r7 = 0x30303030 (MixColumns mask)
*   r8-r11 are clobbered as scratch.
* The Sbox is computed as interleaved NOR/XOR + SWAPMOVE steps; rtk1 is XORed
* only on rounds 0 and 2 of the four (TK1 contributes every round in the spec,
* but the stored rtk1 halves cover two rounds each).
******************************************************************************/
.align 2
quadruple_round:
orr r8, r2, r3 // --- sbox layer, round 0 ---
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
orr r8, r4, r5 // --- sbox layer, round 1 ---
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
orr r8, r2, r3 // --- sbox layer, round 2 ---
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
orr r8, r4, r5 // --- sbox layer, round 3 ---
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-128.
*
* NOTE(review): the prototype comment below appears stale — the code uses all
* four argument registers: r0 (saved on the stack, reloaded at the end as the
* output pointer), r1 (round tweakeys consumed by quadruple_round as rtk2_3),
* r2 (plaintext block, loaded word-wise), r3 (moved into r0 and consumed by
* quadruple_round as rtk1). Confirm actual signature against the caller.
* Stack discipline: push {r0-r12,r14} (14 words); the final
* "ldr.w r0, [sp], #4" pops the saved r0, and pop {r1-r12,r14} restores the
* remaining 13 words, so the stack is balanced.
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14}
mov.w r0, r3 //r0 now points to rtk1 (4th argument)
ldr.w r3, [r2, #8] //load plaintext words
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
//--- packing: byte representation -> bitsliced representation ---
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask for the rounds)
//--- 40 rounds = 10 quadruple rounds ---
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
//--- unpacking: bitsliced representation -> byte representation ---
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4 //reload the saved 1st argument (output pointer)
strd r2, r4, [r0] //store the ciphertext block
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
\ No newline at end of file
/* NIST LWC API parameters for Romulus-N1: 128-bit key, 128-bit nonce,
 * 128-bit tag, no secret message number. */
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
/* Authenticated encryption: writes mlen+CRYPTO_ABYTES bytes to c and sets
 * *clen accordingly; returns 0 on success. */
int crypto_aead_encrypt(
	unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k
	);
/* Authenticated decryption: verifies the trailing tag, writes clen-CRYPTO_ABYTES
 * bytes of plaintext to m and sets *mlen; returns nonzero on failure. */
int crypto_aead_decrypt(
	unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k
	);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
//Encryption and authentication using Romulus-N1
//
// NIST LWC AEAD encryption. The associated data is absorbed two blocks at a
// time (one block XORed into the state, the next fed as TK2 together with the
// key as TK3); the message is then encrypted one block at a time with the
// rho function, with the nonce/key tweakeys (rtk2_3) precomputed once. The
// 56-bit LFSR block counter lives in tk1[0..6] and the domain separation byte
// in tk1[7]. Returns 0; *clen is set to mlen + TAGBYTES.
int crypto_aead_encrypt
	(unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k) {
	int i;
	u32 tmp;                              // scratch word used by the G/UPDATE_CTR macros
	skinny_128_384_tks tks;               // TK1 bytes + precomputed round tweakeys
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; // pad doubles as scratch for RHO
	(void)nsec;                           // no secret message number in this scheme
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; //56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	//Handle the special case of no associated data
	if (adlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		precompute_rtk2_3(tks.rtk2_3, npub, k); // nonce as TK2, key as TK3
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		// Process all double blocks except the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad);     // 1st block absorbed into the state
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k); // 2nd block as TK2
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);             // counter advances once per block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		//Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) {
			// Left-over complete double block
			XOR_BLOCK(state, state, ad);
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) {
			// Left-over partial double block: 10*-style length padding in pad[15]
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;                 // last byte encodes the partial length
			precompute_rtk2_3(tks.rtk2_3, pad, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) {
			// Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else {
			// Left-over partial single block, padded in the state directly
			for(i =0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen;              // implicit truncation to u8
			SET_DOMAIN(tks, 0x1A);
		}
		precompute_rtk2_3(tks.rtk2_3, npub, k); // final AD call uses the nonce
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the plaintext -----------------
	memset(tks.tk1, 0, KEYBYTES);            // restart the counter for the message phase
	tks.tk1[0] = 0x01; //init the 56-bit LFSR counter
	if (mlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		//process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (mlen > BLOCKBYTES) {
			RHO(state,c,m);                  // c = G(state) ^ m; state ^= m
			UPDATE_CTR(tks.tk1);
			precompute_rtk1(tks.rtk1, tks.tk1); // rtk2_3 stays fixed (nonce/key)
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			mlen -= BLOCKBYTES;
		}
		//pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (mlen < BLOCKBYTES) {
			// partial last block: byte-wise rho, G applied per byte
			for(i = 0; i < (int)mlen; i++) {
				tmp = m[i]; //use of tmp variable just in case 'c = m'
				c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
				state[i] ^= (u8)tmp;
			}
			state[15] ^= (u8)mlen; //padding
			SET_DOMAIN(tks, 0x15);
		} else {
			RHO(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
		c += mlen;                           // c now points where the tag goes
	}
	// ----------------- Process the plaintext -----------------
	// ----------------- Generate the tag -----------------
	G(state, state);
	memcpy(c, state, TAGBYTES);
	// ----------------- Generate the tag -----------------
	return 0;
}
//Decryption and tag verification using Romulus-N1
//
// NIST LWC AEAD decryption: mirrors crypto_aead_encrypt (same AD absorption,
// RHO_INV instead of RHO for the ciphertext) and then recomputes the tag.
// Returns 0 on success and -1 on failure (clen too short or tag mismatch).
// The tag comparison accumulates byte XORs so its timing does not depend on
// where the first mismatch occurs.
// NOTE(review): plaintext is written to m before the tag is verified, so the
// caller must discard m when the return value is nonzero.
int crypto_aead_decrypt
	(unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k) {
	int i;
	u32 tmp;                              // scratch word used by the G/UPDATE_CTR macros
	skinny_128_384_tks tks;               // TK1 bytes + precomputed round tweakeys
	u8 state[BLOCKBYTES], pad[BLOCKBYTES]; // pad doubles as scratch for RHO_INV
	(void)nsec;                           // no secret message number in this scheme
	if (clen < TAGBYTES)
		return -1;
	// ----------------- Initialization -----------------
	*mlen = clen - TAGBYTES;
	memset(tks.tk1, 0x00, KEYBYTES);
	memset(state, 0x00, BLOCKBYTES);
	tks.tk1[0] = 0x01; //56-bit LFSR counter
	// ----------------- Initialization -----------------
	// ----------------- Process the associated data -----------------
	//Handle the special case of no associated data
	if (adlen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x1A);
		precompute_rtk2_3(tks.rtk2_3, npub, k); // nonce as TK2, key as TK3
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		// Process all double blocks except the last
		SET_DOMAIN(tks, 0x08);
		while (adlen > 2*BLOCKBYTES) {
			UPDATE_CTR(tks.tk1);
			XOR_BLOCK(state, state, ad);     // 1st block absorbed into the state
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k); // 2nd block as TK2
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);             // counter advances once per block
			ad += 2*BLOCKBYTES;
			adlen -= 2*BLOCKBYTES;
		}
		//Pad and process the left-over blocks
		UPDATE_CTR(tks.tk1);
		if (adlen == 2*BLOCKBYTES) {
			// Left-over complete double block
			XOR_BLOCK(state, state, ad);
			precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x18);
		} else if (adlen > BLOCKBYTES) {
			// Left-over partial double block: 10*-style length padding in pad[15]
			adlen -= BLOCKBYTES;
			XOR_BLOCK(state, state, ad);
			memcpy(pad, ad + BLOCKBYTES, adlen);
			memset(pad + adlen, 0x00, 15 - adlen);
			pad[15] = adlen;                 // last byte encodes the partial length
			precompute_rtk2_3(tks.rtk2_3, pad, k);
			precompute_rtk1(tks.rtk1, tks.tk1);
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			UPDATE_CTR(tks.tk1);
			SET_DOMAIN(tks, 0x1A);
		} else if (adlen == BLOCKBYTES) {
			// Left-over complete single block
			XOR_BLOCK(state, state, ad);
			SET_DOMAIN(tks, 0x18);
		} else {
			// Left-over partial single block, padded in the state directly
			for(i =0; i < (int)adlen; i++)
				state[i] ^= ad[i];
			state[15] ^= adlen;              // implicit truncation to u8
			SET_DOMAIN(tks, 0x1A);
		}
		precompute_rtk2_3(tks.rtk2_3, npub, k); // final AD call uses the nonce
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	}
	// ----------------- Process the associated data -----------------
	// ----------------- Process the ciphertext -----------------
	clen -= TAGBYTES;                        // clen is now the plaintext length
	memset(tks.tk1, 0, KEYBYTES);            // restart the counter for the message phase
	tks.tk1[0] = 0x01; //init the 56-bit LFSR counter
	if (clen == 0) {
		UPDATE_CTR(tks.tk1);
		SET_DOMAIN(tks, 0x15);
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	} else {
		//process all blocks except the last
		SET_DOMAIN(tks, 0x04);
		while (clen > BLOCKBYTES) {
			RHO_INV(state,c,m);              // m = G(state) ^ c; state ^= m
			UPDATE_CTR(tks.tk1);
			precompute_rtk1(tks.rtk1, tks.tk1); // rtk2_3 stays fixed (nonce/key)
			skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
			c += BLOCKBYTES;
			m += BLOCKBYTES;
			clen -= BLOCKBYTES;
		}
		//pad and process the last block
		UPDATE_CTR(tks.tk1);
		if (clen < BLOCKBYTES) {
			// partial last block: byte-wise inverse rho, G applied per byte
			for(i = 0; i < (int)clen; i++) {
				m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
				state[i] ^= m[i];
			}
			state[15] ^= (u8)clen; //padding
			SET_DOMAIN(tks, 0x15);
		} else {
			RHO_INV(state,c,m);
			SET_DOMAIN(tks, 0x14);
		}
		precompute_rtk1(tks.rtk1, tks.tk1);
		skinny128_384_plus(state, state, tks.rtk1, tks.rtk2_3);
	}
	// ----------------- Process the ciphertext -----------------
	// ----------------- Generate and check the tag -----------------
	G(state,state);
	tmp = 0;
	for(i = 0; i < TAGBYTES; i++)
		tmp |= state[i] ^ c[clen+i]; //constant-time tag comparison
	// ----------------- Generate and check the tag -----------------
	// Fix: the original returned the raw accumulator (a value in 1..255 on
	// mismatch); the NIST LWC API expects a negative value on failure.
	return tmp ? -1 : 0;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/* All tweakey material for one SKINNY-128-384 instance. */
typedef struct {
	u8 tk1[16]; //to manipulate tk1 byte-wise
	u32 rtk1[4*16]; //to avoid tk schedule recomputations
	u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Domain separation byte lives in tk1[7]
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
//NOTE(review): these macros cast u8* buffers to u32* — assumes the buffers
//are 4-byte aligned and relies on implementation-defined aliasing; all
//callers in this file pass 16-byte local arrays. GNU statement expressions
//and an in-scope 'u32 tmp' are required.
#define G(x,y) ({ \
	tmp = ((u32*)(y))[0]; \
	((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[1]; \
	((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[2]; \
	((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	tmp = ((u32*)(y))[3]; \
	((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
	})
//update the counter in tk1 in a 32-bit word-wise manner
//(56-bit LFSR over tk1[0..6]; tk1[7] holds the domain byte and is preserved)
#define UPDATE_CTR(tk1) ({ \
	tmp = ((u32*)(tk1))[1]; \
	((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
	((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
	((u32*)(tk1))[1] |= tmp & 0xff000000; \
	((u32*)(tk1))[0] <<= 1; \
	if ((tmp >> 23) & 0x01) \
		((u32*)(tk1))[0] ^= 0x95; \
	})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
	((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
	((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
	((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
	((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
	})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
	G(pad,x); \
	XOR_BLOCK(y, pad, z); \
	XOR_BLOCK(x, x, z); \
	})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
	G(pad, x); \
	XOR_BLOCK(z, pad, y); \
	XOR_BLOCK(x, x, z); \
	})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_0(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(ROR(w,24) & 0x0c0c0c0c, 30);
		w ^= ROR(ROR(w,16) & 0xc0c0c0c0, 4);
		w ^= ROR(ROR(w,8) & 0x0c0c0c0c, 2);
		state[j] = w;
	}
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_1(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(ROR(w,16) & 0x30303030, 30);
		w ^= ROR(w & 0x03030303, 28);
		w ^= ROR(ROR(w,16) & 0x30303030, 2);
		state[j] = w;
	}
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_2(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(ROR(w,8) & 0xc0c0c0c0, 6);
		w ^= ROR(ROR(w,16) & 0x0c0c0c0c, 28);
		w ^= ROR(ROR(w,24) & 0xc0c0c0c0, 2);
		state[j] = w;
	}
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3.
* Each 32-bit slice is updated in place by three mask-rotate-XOR steps.
******************************************************************************/
void mixcolumns_3(u32* state) {
	for(int j = 3; j >= 0; j--) {
		u32 w = state[j];
		w ^= ROR(w & 0x03030303, 30);
		w ^= ROR(w & 0x30303030, 4);
		w ^= ROR(w & 0x03030303, 26);
		state[j] = w;
	}
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
*
* ctext and ptext may alias (the state is unpacked only at the end). rtk1
* holds 16 rounds' worth of TK1 tweakeys (4*16 words) and is reused cyclically
* since P^16 = Id on TK1; rtk2_3 holds all 40 rounds (4*40 words).
******************************************************************************/
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
				const u32* rtk2_3) {
	u32 tmp; // used in SWAPMOVE macro
	u32 state[4]; // 128-bit state
	packing(state, ptext); // from byte to bitsliced representation
	QUADRUPLE_ROUND(state, rtk1, rtk2_3); // rounds 0-3
	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); // rtk1 wraps: repeats every 16 rounds
	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); // rtk1 wraps again
	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); // rounds 36-39
	unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
/* Fixsliced SKINNY-128-384 block encryption; see skinny128.c. */
void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 40
/* Four fixsliced SKINNY rounds: each round is an Sbox layer expressed as
 * NOR/XOR + SWAPMOVE steps, the round-tweakey/rconst addition (rtk1 and
 * rtk2_3 each contribute 4 words per round), and one of the four MixColumns
 * variants (the bit ordering rotates with the round index mod 4).
 * Requires SWAPMOVE and the mixcolumns_* functions, plus a GNU statement
 * expression context. */
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[0]; \
	state[1] ^= (rtk1)[1]; \
	state[2] ^= (rtk1)[2]; \
	state[3] ^= (rtk1)[3]; \
	state[0] ^= (rtk2_3)[0]; \
	state[1] ^= (rtk2_3)[1]; \
	state[2] ^= (rtk2_3)[2]; \
	state[3] ^= (rtk2_3)[3]; \
	mixcolumns_0(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[4]; \
	state[1] ^= (rtk1)[5]; \
	state[2] ^= (rtk1)[6]; \
	state[3] ^= (rtk1)[7]; \
	state[0] ^= (rtk2_3)[4]; \
	state[1] ^= (rtk2_3)[5]; \
	state[2] ^= (rtk2_3)[6]; \
	state[3] ^= (rtk2_3)[7]; \
	mixcolumns_1(state); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= (state[2] | state[3]); \
	SWAPMOVE(state[3], state[0], 0x55555555, 0); \
	state[0] ^= (rtk1)[8]; \
	state[1] ^= (rtk1)[9]; \
	state[2] ^= (rtk1)[10]; \
	state[3] ^= (rtk1)[11]; \
	state[0] ^= (rtk2_3)[8]; \
	state[1] ^= (rtk2_3)[9]; \
	state[2] ^= (rtk2_3)[10]; \
	state[3] ^= (rtk2_3)[11]; \
	mixcolumns_2(state); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= ~(state[0] | state[1]); \
	SWAPMOVE(state[2], state[1], 0x55555555, 1); \
	SWAPMOVE(state[3], state[2], 0x55555555, 1); \
	state[1] ^= ~(state[2] | state[3]); \
	SWAPMOVE(state[1], state[0], 0x55555555, 1); \
	SWAPMOVE(state[0], state[3], 0x55555555, 1); \
	state[3] ^= (state[0] | state[1]); \
	SWAPMOVE(state[1], state[2], 0x55555555, 0); \
	state[0] ^= (rtk1)[12]; \
	state[1] ^= (rtk1)[13]; \
	state[2] ^= (rtk1)[14]; \
	state[3] ^= (rtk1)[15]; \
	state[0] ^= (rtk2_3)[12]; \
	state[1] ^= (rtk2_3)[13]; \
	state[2] ^= (rtk2_3)[14]; \
	state[3] ^= (rtk2_3)[15]; \
	mixcolumns_3(state); \
	})
#endif // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
// Round constants pre-transformed into the fixsliced (bitsliced) domain:
// 4 x 32-bit words per round. 160 words = 40 rounds' worth of constants.
// NOTE(review): precompute_rtk2_3 below indexes this table up to
// 4*SKINNY128_384_ROUNDS-1 (= 223 when SKINNY128_384_ROUNDS is 56), which
// exceeds these 160 entries — confirm the table length matches the round
// count used by this build.
u32 rconst_32_bs[160] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
// Pack a 16-byte block into the 4-word bitsliced representation described
// above. The byte order of the loads (0, 8, 4, 12) and the SWAPMOVE network
// below are order-sensitive: unpacking() applies exactly the inverse steps.
// Requires a local `u32 tmp` for the SWAPMOVE macro.
void packing(u32* out, const u8* in) {
u32 tmp;
// Little-endian loads; words are interleaved (quartets 0/2 and 1/3 swapped).
LE_LOAD(out, in);
LE_LOAD(out + 1, in + 8);
LE_LOAD(out + 2, in + 4);
LE_LOAD(out + 3, in + 12);
// In-word bit swaps first, then cross-word swaps to finish the transpose.
SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[0], 0x30303030, 2);
SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
SWAPMOVE(out[3], out[0], 0x03030303, 6);
SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
SWAPMOVE(out[3], out[2], 0x03030303, 4);
SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation
******************************************************************************/
// Inverse of packing(): the SWAPMOVE sequence below is the exact reverse of
// the one in packing() (SWAPMOVE is an involution for a fixed mask/shift),
// followed by the same little-endian stores in 0/8/4/12 byte order.
// Note: `in` is modified in place before being stored out.
void unpacking(u8* out, u32 *in) {
u32 tmp;
SWAPMOVE(in[3], in[1], 0x03030303, 2);
SWAPMOVE(in[3], in[2], 0x03030303, 4);
SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
SWAPMOVE(in[3], in[0], 0x03030303, 6);
SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
SWAPMOVE(in[2], in[0], 0x30303030, 2);
SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
LE_STORE(out, in[0]);
LE_STORE(out + 8, in[1]);
LE_STORE(out + 4, in[2]);
LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
******************************************************************************/
/*
 * One LFSR2 step on the bitsliced TK2 slices.
 * Feedback word: x0 XOR (x2 restricted to the odd bit positions), with the
 * two bits of each pair swapped afterwards; the four slice words then rotate
 * up by one and the feedback fills the vacated last slot.
 */
void lfsr2_bs(u32* tk) {
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    memmove(tk, tk + 1, 3 * sizeof(u32)); /* tk[0..2] <- tk[1..3] */
    tk[3] = fb;
}
/******************************************************************************
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
******************************************************************************/
/*
 * One LFSR3 step on the bitsliced TK3 slices.
 * Feedback word: x3 XOR (x1's odd bits shifted down), with the two bits of
 * each pair swapped afterwards; the four slice words then rotate down by one
 * and the feedback fills the vacated first slot.
 */
void lfsr3_bs(u32* tk) {
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    memmove(tk + 1, tk, 3 * sizeof(u32)); /* tk[1..3] <- tk[0..2] */
    tk[0] = fb;
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, twice
******************************************************************************/
/* Apply P^2 to each of the four bitsliced tweakey words. */
void permute_tk_2(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 14) & 0xcc00cc00)
              | ((w & 0x000000ff) << 16)
              | ((w & 0xcc000000) >> 2)
              | ((w & 0x0033cc00) >> 8)
              | ((w & 0x00cc0000) >> 18);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
/* Apply P^4 to each of the four bitsliced tweakey words. */
void permute_tk_4(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 22) & 0xcc0000cc)
              | (ROR(w, 16) & 0x3300cc00)
              | (ROR(w, 24) & 0x00cc3300)
              | ((w & 0x00cc00cc) >> 2);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
/* Apply P^6 to each of the four bitsliced tweakey words. */
void permute_tk_6(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 6) & 0xcccc0000)
              | (ROR(w, 24) & 0x330000cc)
              | (ROR(w, 10) & 0x3333)
              | ((w & 0xcc) << 14)
              | ((w & 0x3300) << 2);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
/* Apply P^8 to each of the four bitsliced tweakey words. */
void permute_tk_8(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 24) & 0xcc000033)
              | (ROR(w, 8) & 0x33cc0000)
              | (ROR(w, 26) & 0x00333300)
              | ((w & 0x00333300) >> 6);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
/* Apply P^10 to each of the four bitsliced tweakey words. */
void permute_tk_10(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 8) & 0xcc330000)
              | (ROR(w, 26) & 0x33000033)
              | (ROR(w, 22) & 0x00cccc00)
              | ((w & 0x00330000) >> 14)
              | ((w & 0xcc00) >> 2);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
/* Apply P^12 to each of the four bitsliced tweakey words. */
void permute_tk_12(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 8) & 0xcc33)
              | (ROR(w, 30) & 0x00cc00cc)
              | (ROR(w, 10) & 0x33330000)
              | (ROR(w, 16) & 0xcc003300);
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
/* Apply P^14 to each of the four bitsliced tweakey words. */
void permute_tk_14(u32* tk) {
    for (int i = 0; i < 4; ++i) {
        u32 w = tk[i];
        tk[i] = (ROR(w, 24) & 0x0033cc00)
              | (ROR(w, 14) & 0x00cc0000)
              | (ROR(w, 30) & 0xcc000000)
              | (ROR(w, 16) & 0x000000ff)
              | (ROR(w, 18) & 0x33003300);
    }
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
/*
 * Pack TK2 once, then write one LFSR2-advanced copy per pair of rounds
 * (the schedule stores a fresh TK2 only every two rounds, hence r += 2).
 * Output layout: tk[0..3] = round 0/1 value, tk[4r+4 .. 4r+7] = next pairs.
 */
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
    u32 packed[4];
    packing(packed, key);
    memcpy(tk, packed, sizeof(packed));
    for (int r = 0; r < rounds; r += 2) {
        lfsr2_bs(packed);
        memcpy(tk + 4 * r + 4, packed, sizeof(packed));
    }
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
/*
 * Pack TK3 and XOR it on top of the TK2 words already stored by
 * precompute_lfsr_tk2, advancing LFSR3 once per pair of rounds so the
 * same slots receive the matching TK3 contribution.
 */
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
    u32 packed[4];
    packing(packed, key);
    for (int j = 0; j < 4; ++j)
        tk[j] ^= packed[j];
    for (int r = 0; r < rounds; r += 2) {
        lfsr3_bs(packed);
        for (int j = 0; j < 4; ++j)
            tk[4 * r + 4 + j] ^= packed[j];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
// Fold TK1 into the precomputed TK2^TK3 words and rearrange everything into
// the barrel-shiftrows layout expected by the fixsliced round function.
// Each loop iteration emits 8 rounds' worth of round tweakeys; `test`
// selects which power of the tweakey permutation P applies (P^2..P^8 on the
// first 8 rounds of every 16, P^10..P^14 on the second 8, since P^16 = id).
// The ROR/mask pairs below re-slice the words; their exact values are
// layout-critical and must not be reordered.
void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;
u32 tk1[4], tmp[4];
packing(tk1, key);
// Rounds 0/1 use the unpermuted tweakey: XOR TK1 directly.
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
// Next pair of rounds: fetch the stored TK2^TK3 words, add TK1, permute.
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_2(tmp); // applies P^2
else
permute_tk_10(tmp); // applies P^10
tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_4(tmp); // applies P^4
else
permute_tk_12(tmp); // applies P^12
for(int j = 0; j < 4; j++) {
tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
}
tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_6(tmp); // applies P^6
else
permute_tk_14(tmp); // applies P^14
tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
// In the P^10..P^14 half-cycle the last pair needs no permutation (P^16=id).
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
}
if (test && (i+8 < rounds)) { //only if next loop iteration
tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
}
}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
// Build the full per-round tweakey material for TK2/TK3: LFSR-advanced
// TK2 ^ TK3 values, rearranged into the fixsliced layout, with the round
// constants XORed in.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
// The LFSR precomputations only fill every other 4-word group, so
// rtk[8..11] still hold the zeros from the memset above; passing
// (u8*)(rtk+8) as the TK1 argument therefore makes the TK1 XOR inside
// permute_tk a no-op ("NULL" here means an all-zero block).
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
// NOTE(review): rconst_32_bs is declared with 160 entries but this loop
// reads index i*4+j up to 4*SKINNY128_384_ROUNDS-1 (= 223 for 56 rounds);
// confirm table length vs round count.
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
rtk[i*4+j] ^= rconst_32_bs[i*4+j];
}
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
// Rearrange TK1 alone into the fixsliced layout for the first 16 rounds;
// the round function re-cycles these words (TK1 has no LFSR, and P^16 = id,
// so 16 rounds of material suffice).
void precompute_rtk1(u32* rtk1, const u8* tk1) {
memset(rtk1, 0x00, 16*16);
permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
// Pack/unpack between byte blocks and the 4-word bitsliced representation.
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
// Precompute round tweakey material (see tk_schedule.c).
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
// 32-bit rotate right; y must be in 1..31 (y == 0 or 32 would shift by 32, UB).
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
// XOR two 4-word blocks in place: x ^= y. GNU statement expression.
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
// Swap the bits selected by `mask` between b and (a >> n), the classic
// bitslicing primitive. Requires a variable `u32 tmp` in the caller's scope.
// NOTE(review): the macro parameters are expanded unparenthesized, so only
// pass simple expressions (all current call sites do).
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n); \
})
// Little-endian load of 4 bytes at y into the u32 pointed to by x.
#define LE_LOAD(x, y) \
*(x) = (((u32)(y)[3] << 24) | \
((u32)(y)[2] << 16) | \
((u32)(y)[1] << 8) | \
(y)[0]);
// Little-endian store of u32 y to the 4 bytes at x.
// NOTE(review): multi-statement macro without do{...}while(0); safe at the
// current statement-level call sites, but hazardous after a bare `if`.
#define LE_STORE(x, y) \
(x)[0] = (y) & 0xff; \
(x)[1] = ((y) >> 8) & 0xff; \
(x)[2] = ((y) >> 16) & 0xff; \
(x)[3] = (y) >> 24;
#endif // TK_SCHEDULE_H_
\ No newline at end of file
// NIST LWC API parameters for Romulus-N1.
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number (nsec unused)
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
#include "skinny128.h"
#include "romulus.h"
#include <string.h>
//Encryption and authentication using Romulus-N1
/**
 * Romulus-N1 authenticated encryption (NIST LWC API).
 * Writes mlen + TAGBYTES bytes to c (ciphertext followed by the 16-byte tag)
 * and sets *clen accordingly. nsec is unused. Always returns 0.
 * The round tweakeys for (npub, k) are scheduled once during AD processing
 * and reused for every message block; only TK1 (56-bit block counter plus
 * domain byte) is rescheduled per block.
 */
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
// ----------------- Initialization -----------------
*clen = mlen + TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
if (adlen == 0) { // Handle the special case of no AD
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process double blocks but the last
SET_DOMAIN(tks, 0x08);
while (adlen > 2*BLOCKBYTES) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
// 2nd AD block of the pair is absorbed through the tweakey (TK2 slot).
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
UPDATE_CTR(tks.tk1);
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x18);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
XOR_BLOCK(state, state, ad);
// 10*-style padding: zero fill, last byte records the leftover length.
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen;
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
} else if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
SET_DOMAIN(tks, 0x18);
} else { // Left-over partial single block
for(int i = 0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
SET_DOMAIN(tks, 0x1A);
}
// Final AD call absorbs the nonce through the tweakey schedule.
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
}
// ----------------- Process the associated data -----------------
// ----------------- Process the plaintext -----------------
// Reset only the counter/domain half of TK1 (bytes 0..7 are used).
memset(tks.tk1, 0x00, KEYBYTES/2);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (mlen == 0) {
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x15);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process all blocks except the last
SET_DOMAIN(tks, 0x04);
while (mlen > BLOCKBYTES) {
RHO(state,c,m);
UPDATE_CTR(tks.tk1);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
mlen -= BLOCKBYTES;
}
// Pad and process the last block
UPDATE_CTR(tks.tk1);
if (mlen < BLOCKBYTES) { // Last message single block is partial
// Byte-wise rho on the leftover bytes only (G applied per byte).
for(int i = 0; i < (int)mlen; i++) {
tmp = m[i]; // Use of tmp variable in case c = m
c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= (u8)tmp;
}
state[15] ^= (u8)mlen; // Padding
SET_DOMAIN(tks, 0x15);
} else { // Last message single block is full
RHO(state,c,m);
SET_DOMAIN(tks, 0x14);
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
c += mlen;
}
// ----------------- Process the plaintext -----------------
// ----------------- Generate the tag -----------------
G(c,state);
// ----------------- Generate the tag -----------------
return 0;
}
//Decryption and tag verification using Romulus-N1
/**
 * Romulus-N1 authenticated decryption (NIST LWC API).
 * Recovers clen - TAGBYTES bytes of plaintext into m and sets *mlen.
 * Returns -1 if clen < TAGBYTES.
 * NOTE(review): on tag mismatch this returns the nonzero byte-OR difference
 * (1..255) rather than -1; callers must test the result against 0 — confirm
 * against the expected NIST API contract. The plaintext has already been
 * written to m at that point.
 */
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k) {
u32 tmp;
skinny_128_384_tks tks;
u8 state[BLOCKBYTES], pad[BLOCKBYTES];
(void)nsec;
if (clen < TAGBYTES)
return -1;
// ----------------- Initialization -----------------
*mlen = clen - TAGBYTES;
memset(tks.tk1, 0x00, KEYBYTES);
memset(state, 0x00, BLOCKBYTES);
tks.tk1[0] = 0x01; // Init 56-bit LFSR counter
// ----------------- Initialization -----------------
// ----------------- Process the associated data -----------------
// (identical to the AD phase of crypto_aead_encrypt)
if (adlen == 0) { // Handle the special case of no AD
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process double blocks except the last
SET_DOMAIN(tks, 0x08);
while (adlen > 2*BLOCKBYTES) {
UPDATE_CTR(tks.tk1);
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
ad += 2*BLOCKBYTES;
adlen -= 2*BLOCKBYTES;
}
// Pad and process the left-over blocks
UPDATE_CTR(tks.tk1);
if (adlen == 2*BLOCKBYTES) { // Left-over complete double block
XOR_BLOCK(state, state, ad);
tkschedule_lfsr(tks.rtk, ad + BLOCKBYTES, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x18);
} else if (adlen > BLOCKBYTES) { // Left-over partial double block
adlen -= BLOCKBYTES;
XOR_BLOCK(state, state, ad);
memcpy(pad, ad + BLOCKBYTES, adlen);
memset(pad + adlen, 0x00, 15 - adlen);
pad[15] = adlen;
tkschedule_lfsr(tks.rtk, pad, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x1A);
} else if (adlen == BLOCKBYTES) { // Left-over complete single block
XOR_BLOCK(state, state, ad);
SET_DOMAIN(tks, 0x18);
} else { // Left-over partial single block
for(int i = 0; i < (int)adlen; i++)
state[i] ^= ad[i];
state[15] ^= adlen; // Padding
SET_DOMAIN(tks, 0x1A);
}
tkschedule_lfsr(tks.rtk, npub, k, SKINNY128_384_ROUNDS);
tkschedule_perm(tks.rtk);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
}
// ----------------- Process the associated data -----------------
// ----------------- Process the ciphertext -----------------
clen -= TAGBYTES;
// Reset only the counter/domain half of TK1 (bytes 0..7 are used).
memset(tks.tk1, 0x00, KEYBYTES/2);
tks.tk1[0] = 0x01; // Init the 56-bit LFSR counter
if (clen == 0) {
UPDATE_CTR(tks.tk1);
SET_DOMAIN(tks, 0x15);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
} else { // Process all blocks except the last
SET_DOMAIN(tks, 0x04);
while (clen > BLOCKBYTES) {
RHO_INV(state,c,m);
UPDATE_CTR(tks.tk1);
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
c += BLOCKBYTES;
m += BLOCKBYTES;
clen -= BLOCKBYTES;
}
// Pad and process the last block
UPDATE_CTR(tks.tk1);
if (clen < BLOCKBYTES) { // Last message single block is partial
// Byte-wise inverse rho on the leftover bytes only.
for(int i = 0; i < (int)clen; i++) {
m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
state[i] ^= m[i];
}
state[15] ^= (u8)clen; // Padding
SET_DOMAIN(tks, 0x15);
} else { // Last message single block is full
RHO_INV(state,c,m);
SET_DOMAIN(tks, 0x14);
}
tkschedule_perm_tk1(tks.rtk1, tks.tk1);
skinny128_384(state, tks.rtk, state, tks.rtk1);
}
// ----------------- Process the ciphertext -----------------
// ----------------- Generate and check the tag -----------------
G(state,state);
tmp = 0;
// c was not advanced past the final partial block, so c[clen..] is the tag.
for(int i = 0; i < TAGBYTES; i++)
tmp |= state[i] ^ c[clen+i]; // Constant-time tag comparison
// ----------------- Generate and check the tag -----------------
return tmp;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
// Tweakey context: byte-wise TK1 (counter + domain byte) plus the
// precomputed round tweakeys consumed by the SKINNY-128-384 routines.
typedef struct {
u8 tk1[16]; //to manipulate tk1 in a byte-wise manner
u32 rtk1[32]; //to avoid recomputation of the tk schedule
u32 rtk[4*SKINNY128_384_ROUNDS];//all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Domain-separation byte lives in TK1 byte 7, just past the 56-bit counter.
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner
// Processes 4 bytes per word; the masks keep the operation byte-local, so
// it is endianness-neutral. Requires a variable `u32 tmp` in caller scope.
// NOTE(review): the u8* -> u32* casts here and below assume 4-byte-aligned
// buffers — true for the local state/pad arrays used by the callers.
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner
// Shifts the 56-bit LFSR in bytes 0..6 left by one with feedback 0x95,
// preserving the domain byte (tk1[7]) via the tmp & 0xff000000 term.
// Requires `u32 tmp` in caller scope; assumes little-endian word access —
// NOTE(review): confirm if this code is ever built for a big-endian target.
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification
//use pad as a tmp variable in case y = z
// Requires `u8 pad[BLOCKBYTES]` (and tmp, via G) in the caller's scope.
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
// SKINNY-128-384 block encryption plus tweakey-schedule helpers; declared
// extern as they are presumably provided by the accompanying ARM assembly
// file — TODO confirm which translation unit defines them in this build.
extern void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1);
extern void tkschedule_lfsr(u32* rtk, const u8* tk2, const u8* tk3, const int rounds);
extern void tkschedule_perm(u32* rtk);
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p2: apply P^2 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_2 in tk_schedule.c.
// Clobbers r1, r10 (masks) and r11, r12 (scratch).
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0x0033cc00
// --- permute r6 2 times
and r11, r1, r6, ror #14
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18
// --- permute r7 2 times
and r11, r1, r7, ror #14
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18
// --- permute r8 2 times
and r11, r1, r8, ror #14
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18
// --- permute r9 2 times
and r11, r1, r9, ror #14
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18
bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p4: apply P^4 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_4 in tk_schedule.c.
// Clobbers r1, r10, r11, r12; spills/restores r14.
// NOTE(review): stores r14 at [sp] without adjusting sp — assumes the
// caller reserved that slot; confirm at the call sites.
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
// --- permute r6 4 times
and r10, r14, r6, ror #22
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24
// --- permute r7 4 times
and r10, r14, r7, ror #22
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24
// --- permute r8 4 times
and r10, r14, r8, ror #22
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24
// --- permute r9 4 times (r12 reused as scratch after r14 is restored)
and r10, r14, r9, ror #22
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33
and r9, r9, r12
orr r9, r10, r9, ror #24
bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p6: apply P^6 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_6 in tk_schedule.c.
// Clobbers r1, r12 (masks) and r10, r11 (scratch).
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p8: apply P^8 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_8 in tk_schedule.c.
// Clobbers r1, r12 (masks) and r10, r11 (scratch).
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*******************************************************************************/
.align 2
// p10: apply P^10 to the 4 bitsliced tweakey words held in r6-r9.
// Same masks/shifts as permute_tk_10 in tk_schedule.c.
// Clobbers r1, r12 (masks) and r10, r11 (scratch).
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
* in/out: bitsliced tweakey state in r6-r9 (one 32-bit slice per register)
* clobbers: r1, r10, r11, r12; r14 is used as an extra mask register and is
* saved/restored via [sp] (the caller reserves this slot with `sub.w sp, #4`)
*******************************************************************************/
.align 2
p12:
str.w r14, [sp] //store r14 on the stack (slot reserved by caller)
movw r14, #0xcc33 //r14<- 0x0000cc33 (bit-selection mask for P^12)
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc (bit-selection mask for P^12)
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300 (bit-selection mask for P^12)
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14 (no longer needed as a mask)
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
* in/out: bitsliced tweakey state in r6-r9 (one 32-bit slice per register)
* clobbers: r1, r10, r11, r12 (mask and scratch registers)
*******************************************************************************/
.align 2
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00 (bit-selection mask for P^14)
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00 (bit-selection mask for P^14)
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
/*******************************************************************************
* packs the 128-bit state held in r2-r5 into the bitsliced representation,
* via a network of SWAPMOVE operations (SWAPMOVE(a, b, mask, n): swaps the
* bits of b selected by `mask` with the bits of a selected by `mask << n`).
* expects: r10 = 0x0a0a0a0a and r11 = 0x30303030 (set up by the caller)
* clobbers: r12 (scratch)
*******************************************************************************/
.align 2
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2 //mask 0x0c0c0c0c derived from r11 on the fly
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4 //mask 0x03030303 derived from r11 on the fly
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
* Arguments (AAPCS): r0 = tk (output round-tweakey array), r1 = tk2,
* r2 = tk3, r3 = rounds. The rounds argument is recovered from the stacked
* r3 at [sp, #12] after the initial push. TK2 is kept bitsliced in r6-r9 and
* TK3 in r2-r5; the loop unrolls 8 rounds per iteration, applying the word-
* wise LFSR updates on the 0xaaaaaaaa-masked bit pairs and storing the
* XORed halves with 24-byte strides (16-byte gaps are left between pairs —
* presumably filled in later by the permutation schedule; confirm against
* tkschedule_perm).
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask expected by packing)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask expected by packing)
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa (LFSR bit-pair mask)
ldr.w r1, [sp, #12] //load loop counter (stacked r3 = rounds) in r1
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop:
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip 16 bytes)
subs.w r1, r1, #8 //decrease loop counter by 8 (8 rounds per iteration)
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add const
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 41th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000010 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 41th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0], #24 //store 2nd half tk for 40th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 42th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 42th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 43th round
strd r6, r7, [r0], #8 //store 2nd half tk for 43th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 45th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 45th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 44th round
strd r8, r9, [r0], #24 //store 2nd half tk for 44th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 46th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 46th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 47th round
strd r6, r7, [r0], #8 //store 2nd half tk for 47th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 49th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 49th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 48th round
strd r8, r9, [r0], #24 //store 2nd half tk for 48th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 50th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000140 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 50th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 51th round
strd r6, r7, [r0], #8 //store 2nd half tk for 51th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 53th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 53th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 52th round
strd r8, r9, [r0], #24 //store 2nd half tk for 52th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 54th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 54th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 55th round
strd r6, r7, [r0], #8 //store 2nd half tk for 55th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 56th round
strd r8, r9, [r0], #24 //store 2nd half tk for 56th round
add.w sp, #4
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Precomputes the round tweakeys derived from TK1: applies the permutations
* P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we don't need more
* calculations as no LFSR is applied to TK1.
*
* r0: output area for the precomputed rtk1 words (8 stores x 16 bytes = 128
*     bytes, covering 16 rounds; reused cyclically by skinny128_384)
* r1: pointer to the 16-byte TK1 key material
* NOTE(review): relies on the external subroutines 'packing' (bitsliced
* packing of r2-r5, presumably using the masks preloaded in r10/r11 — confirm
* against its definition) and 'p2' (tweakey permutation applied twice).
* All registers are preserved via push/pop.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
bl p2 //apply the permutation twice
movw r3, #0x0303
movt r3, #0x0303 //r3<- 0x03030303
//--- tk for 3rd round ---
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 4 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation 6 times
//--- tk for 7th round ---
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 8 times
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation 10 times
//--- tk for 11th round ---
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation 12 times
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation 14 times
//--- tk for 15th round ---
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0] //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Subroutine applying 4 consecutive rounds of fixsliced SKINNY-128-384 to the
* bitsliced internal state held in r2-r5.
* Expects: r0 = pointer to precomputed rtk1 words (advanced by 32 bytes)
*          r1 = pointer to precomputed rtk2_3 words (advanced by 64 bytes)
*          r6 = 0x55555555 (SWAPMOVE mask)
*          r7 = 0x30303030 (mixcolumns mask)
* Clobbers r8-r11 (round-key words; r8 also used as scratch).
* NOTE(review): rtk1 is loaded (from r0) only on the 1st and 3rd of the 4
* rounds, while rtk2_3 (from r1) is added every round — presumably the rtk1
* contribution of the other rounds is folded in at precomputation time;
* verify against the tweakey-schedule routines.
******************************************************************************/
.align 2
quadruple_round:
//--- 1st round: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11} //load rtk_1 words
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
//--- 2nd round: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
//--- 3rd round: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11} //load rtk_1 words
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
//--- 4th round: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384 (14 calls to
* quadruple_round = 56 rounds; the 128-byte rtk1 array repeats every 16
* rounds).
*
* r0: ctext - output ciphertext block (16 bytes; may alias ptext)
* r1: tk    - precomputed rtk2_3 round tweakeys (consumed by quadruple_round)
* r2: ptext - input plaintext block (16 bytes)
* r3: rtk1  - precomputed rtk1 round tweakeys (128 bytes, cycled 3 times)
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14}
mov.w r0, r3 //r0<- rtk1 (ctext pointer stays on the stack)
ldr.w r3, [r2, #8] //load ptext (3rd word)
ldr.w r4, [r2, #4] //load ptext (2nd word)
ldr.w r5, [r2, #12] //load ptext (4th word)
ldr.w r2, [r2] //load ptext (1st word)
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
movw r7, #0x3030
movt r7, #0x3030 //r7 <- 0x30303030
//--- pack the plaintext into the bitsliced representation ---
eor r12, r2, r2, lsr #3
and r12, r12, r6
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r6
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r6
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r6
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r7
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r7, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r7, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r7, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r7, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r7, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask for quadruple_round)
bl quadruple_round //rounds 1-4
bl quadruple_round //rounds 5-8
bl quadruple_round //rounds 9-12
bl quadruple_round //rounds 13-16
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 17-20
bl quadruple_round //rounds 21-24
bl quadruple_round //rounds 25-28
bl quadruple_round //rounds 29-32
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 33-36
bl quadruple_round //rounds 37-40
bl quadruple_round //rounds 41-44
bl quadruple_round //rounds 45-48
sub.w r0, #128 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 49-52
bl quadruple_round //rounds 53-56
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
//--- unpack the state (inverse of the packing SWAPMOVE sequence above) ---
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
ldr.w r0, [sp], #4 //restore the ctext pointer pushed at entry
strd r2, r4, [r0] //store ctext (1st and 2nd words)
strd r3, r5, [r0, #8] //store ctext (3rd and 4th words)
pop {r1-r12,r14} //r0 was already popped above
bx lr
\ No newline at end of file
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
#include "skinny128.h"
#include "tk_schedule.h"
#include "romulus.h"
#include <string.h>
#include <stdio.h>
//Encryption and authentication using Romulus-N1.
//NIST LWC AEAD interface: encrypts mlen bytes of m under 128-bit key k and
//128-bit nonce npub, authenticating adlen bytes of ad. Writes mlen+TAGBYTES
//bytes of ciphertext||tag to c and sets *clen accordingly. Always returns 0.
//nsec is unused (CRYPTO_NSECBYTES == 0).
int crypto_aead_encrypt
(unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k) {
    int i;
    u32 tmp;                                // scratch word required by the G/UPDATE_CTR/RHO macros
    skinny_128_384_tks tks;                 // TK1 counter + precomputed round tweakeys
    u8 state[BLOCKBYTES], pad[BLOCKBYTES];  // 128-bit rolling state; pad doubles as RHO scratch
    (void)nsec;
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    memset(tks.tk1, 0x00, KEYBYTES);
    memset(state, 0x00, BLOCKBYTES);
    tks.tk1[0] = 0x01; //56-bit LFSR counter, initialized to 1
    // ----------------- Initialization -----------------
    // ----------------- Process the associated data -----------------
    //Handle the special case of no associated data: a single call with the
    //padded-empty-AD domain (0x1A) absorbs nothing but advances the state.
    if (adlen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x1A);
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        // Process all double blocks except the last: the 1st block of each
        // pair is XORed into the state, the 2nd is used as the tweak (TK2).
        SET_DOMAIN(tks, 0x08);
        while (adlen > 2*BLOCKBYTES) {
            UPDATE_CTR(tks.tk1);
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);    // counter also advances over the tweak block
            ad += 2*BLOCKBYTES;
            adlen -= 2*BLOCKBYTES;
        }
        //Pad and process the left-over blocks; the domain encodes whether the
        //final (single or double) block is full or padded.
        UPDATE_CTR(tks.tk1);
        if (adlen == 2*BLOCKBYTES) {
            // Left-over complete double block
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x18);
        } else if (adlen > BLOCKBYTES) {
            // Left-over partial double block: 10*-style pad with the byte
            // length in the last byte (Romulus padding)
            adlen -= BLOCKBYTES;
            XOR_BLOCK(state, state, ad);
            memcpy(pad, ad + BLOCKBYTES, adlen);
            memset(pad + adlen, 0x00, 15 - adlen);
            pad[15] = adlen;    // length byte completes the padding
            precompute_rtk2_3(tks.rtk2_3, pad, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x1A);
        } else if (adlen == BLOCKBYTES) {
            // Left-over complete single block
            XOR_BLOCK(state, state, ad);
            SET_DOMAIN(tks, 0x18);
        } else {
            // Left-over partial single block, padded into the state directly
            for(i =0; i < (int)adlen; i++)
                state[i] ^= ad[i];
            state[15] ^= adlen;     // length byte of the padding
            SET_DOMAIN(tks, 0x1A);
        }
        // Final AD call uses the nonce as TK2
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    }
    // ----------------- Process the associated data -----------------
    // ----------------- Process the plaintext -----------------
    memset(tks.tk1, 0, KEYBYTES);
    tks.tk1[0] = 0x01; //re-init the 56-bit LFSR counter for the message phase
    if (mlen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x15);  // empty message domain
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        //process all blocks except the last with the RHO output function
        SET_DOMAIN(tks, 0x04);
        while (mlen > BLOCKBYTES) {
            RHO(state,c,m);
            UPDATE_CTR(tks.tk1);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            c += BLOCKBYTES;
            m += BLOCKBYTES;
            mlen -= BLOCKBYTES;
        }
        //pad and process the last block
        UPDATE_CTR(tks.tk1);
        if (mlen < BLOCKBYTES) {
            // byte-wise RHO on the partial block; per-byte G is
            // (x >> 1) ^ (x & 0x80) ^ (x << 7), i.e. an LFSR step on each byte
            for(i = 0; i < (int)mlen; i++) {
                tmp = m[i]; //use of tmp variable just in case 'c = m'
                c[i] = m[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
                state[i] ^= (u8)tmp;
            }
            state[15] ^= (u8)mlen; //padding
            SET_DOMAIN(tks, 0x15);
        } else {
            RHO(state,c,m);
            SET_DOMAIN(tks, 0x14);  // full final block
        }
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
        c += mlen;  // advance past the last (possibly partial) ciphertext block
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Generate the tag -----------------
    G(state,state);
    memcpy(c, state, TAGBYTES);
    // ----------------- Generate the tag -----------------
    return 0;
}
//Decryption and tag verification using Romulus-N1.
//NIST LWC AEAD interface: decrypts clen-TAGBYTES bytes of c under 128-bit key
//k and nonce npub, re-authenticating ad, and writes the plaintext to m
//(setting *mlen). Returns 0 on success and -1 on authentication failure,
//per the NIST/SUPERCOP API convention. The tag comparison is constant-time.
//nsec is unused (CRYPTO_NSECBYTES == 0).
int crypto_aead_decrypt
(unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k) {
    int i;
    u32 tmp;                                // scratch word required by the G/UPDATE_CTR/RHO_INV macros
    skinny_128_384_tks tks;                 // TK1 counter + precomputed round tweakeys
    u8 state[BLOCKBYTES], pad[BLOCKBYTES];  // 128-bit rolling state; pad doubles as RHO_INV scratch
    (void)nsec;
    if (clen < TAGBYTES)    // ciphertext must at least contain the tag
        return -1;
    // ----------------- Initialization -----------------
    *mlen = clen - TAGBYTES;
    memset(tks.tk1, 0x00, KEYBYTES);
    memset(state, 0x00, BLOCKBYTES);
    tks.tk1[0] = 0x01; //56-bit LFSR counter, initialized to 1
    // ----------------- Initialization -----------------
    // ----------------- Process the associated data -----------------
    //Handle the special case of no associated data (mirrors encryption)
    if (adlen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x1A);
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        // Process all double blocks except the last
        SET_DOMAIN(tks, 0x08);
        while (adlen > 2*BLOCKBYTES) {
            UPDATE_CTR(tks.tk1);
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);    // counter also advances over the tweak block
            ad += 2*BLOCKBYTES;
            adlen -= 2*BLOCKBYTES;
        }
        //Pad and process the left-over blocks
        UPDATE_CTR(tks.tk1);
        if (adlen == 2*BLOCKBYTES) {
            // Left-over complete double block
            XOR_BLOCK(state, state, ad);
            precompute_rtk2_3(tks.rtk2_3, ad + BLOCKBYTES, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x18);
        } else if (adlen > BLOCKBYTES) {
            // Left-over partial double block: pad with the length byte last
            adlen -= BLOCKBYTES;
            XOR_BLOCK(state, state, ad);
            memcpy(pad, ad + BLOCKBYTES, adlen);
            memset(pad + adlen, 0x00, 15 - adlen);
            pad[15] = adlen;
            precompute_rtk2_3(tks.rtk2_3, pad, k);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            UPDATE_CTR(tks.tk1);
            SET_DOMAIN(tks, 0x1A);
        } else if (adlen == BLOCKBYTES) {
            // Left-over complete single block
            XOR_BLOCK(state, state, ad);
            SET_DOMAIN(tks, 0x18);
        } else {
            // Left-over partial single block, padded into the state directly
            for(i =0; i < (int)adlen; i++)
                state[i] ^= ad[i];
            state[15] ^= adlen;
            SET_DOMAIN(tks, 0x1A);
        }
        // Final AD call uses the nonce as TK2
        precompute_rtk2_3(tks.rtk2_3, npub, k);
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    }
    // ----------------- Process the associated data -----------------
    // ----------------- Process the ciphertext -----------------
    clen -= TAGBYTES;   // clen now counts ciphertext bytes only
    memset(tks.tk1, 0, KEYBYTES);
    tks.tk1[0] = 0x01; //re-init the 56-bit LFSR counter for the message phase
    if (clen == 0) {
        UPDATE_CTR(tks.tk1);
        SET_DOMAIN(tks, 0x15);  // empty message domain
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    } else {
        //process all blocks except the last with the inverse RHO function
        SET_DOMAIN(tks, 0x04);
        while (clen > BLOCKBYTES) {
            RHO_INV(state,c,m);
            UPDATE_CTR(tks.tk1);
            precompute_rtk1(tks.rtk1, tks.tk1);
            skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
            c += BLOCKBYTES;
            m += BLOCKBYTES;
            clen -= BLOCKBYTES;
        }
        //pad and process the last block
        UPDATE_CTR(tks.tk1);
        if (clen < BLOCKBYTES) {
            // byte-wise inverse RHO on the partial block
            for(i = 0; i < (int)clen; i++) {
                m[i] = c[i] ^ (state[i] >> 1) ^ (state[i] & 0x80) ^ (state[i] << 7);
                state[i] ^= m[i];
            }
            state[15] ^= (u8)clen; //padding
            SET_DOMAIN(tks, 0x15);
        } else {
            RHO_INV(state,c,m);
            SET_DOMAIN(tks, 0x14);  // full final block
        }
        precompute_rtk1(tks.rtk1, tks.tk1);
        skinny128_384(state, state, tks.rtk1, tks.rtk2_3);
    }
    // ----------------- Process the ciphertext -----------------
    // ----------------- Generate and check the tag -----------------
    G(state,state);
    tmp = 0;
    for(i = 0; i < TAGBYTES; i++)
        tmp |= state[i] ^ c[clen+i]; //constant-time tag comparison
    // ----------------- Generate and check the tag -----------------
    // Return -1 (not an arbitrary nonzero value) on forgery, as required by
    // the NIST LWC / SUPERCOP crypto_aead API.
    return tmp ? -1 : 0;
}
\ No newline at end of file
#ifndef ROMULUSN1_H_
#define ROMULUSN1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
// Tweakey material for Romulus-N1: the byte-addressable TK1 (domain byte +
// 56-bit block counter) plus the precomputed round tweakeys for
// SKINNY-128-384 so the key schedule is not recomputed per block.
typedef struct {
u8 tk1[16]; //to manipulate tk1 byte-wise
u32 rtk1[4*16]; //to avoid tk schedule recomputations
u32 rtk2_3[4*SKINNY128_384_ROUNDS]; //all round tweakeys
} skinny_128_384_tks;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Write the domain-separation byte into TK1 byte 7 (just after the counter).
#define SET_DOMAIN(tks, domain) ((tks).tk1[7] = (domain))
//G as defined in the Romulus specification in a 32-bit word-wise manner.
//NOTE: all macros below use GNU statement expressions and rely on a 'u32 tmp'
//variable being declared in the calling scope.
#define G(x,y) ({ \
tmp = ((u32*)(y))[0]; \
((u32*)(x))[0] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[1]; \
((u32*)(x))[1] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[2]; \
((u32*)(x))[2] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
tmp = ((u32*)(y))[3]; \
((u32*)(x))[3] = (tmp >> 1 & 0x7f7f7f7f) ^ ((tmp ^ (tmp << 7)) & 0x80808080); \
})
//update the counter in tk1 in a 32-bit word-wise manner: one step of the
//56-bit LFSR stored in tk1 bytes 0..6 (byte 7 is the domain byte).
#define UPDATE_CTR(tk1) ({ \
tmp = ((u32*)(tk1))[1]; \
((u32*)(tk1))[1] = (tmp << 1) & 0x00ffffff; \
((u32*)(tk1))[1] |= (((u32*)(tk1))[0] >> 31); \
((u32*)(tk1))[1] |= tmp & 0xff000000; \
((u32*)(tk1))[0] <<= 1; \
if ((tmp >> 23) & 0x01) \
((u32*)(tk1))[0] ^= 0x95; \
})
//x <- y ^ z for 128-bit blocks
#define XOR_BLOCK(x,y,z) ({ \
((u32*)(x))[0] = ((u32*)(y))[0] ^ ((u32*)(z))[0]; \
((u32*)(x))[1] = ((u32*)(y))[1] ^ ((u32*)(z))[1]; \
((u32*)(x))[2] = ((u32*)(y))[2] ^ ((u32*)(z))[2]; \
((u32*)(x))[3] = ((u32*)(y))[3] ^ ((u32*)(z))[3]; \
})
//Rho as defined in the Romulus specification: y = G(x) ^ z, then x ^= z.
//use pad as a tmp variable in case y = z
#define RHO(x,y,z) ({ \
G(pad,x); \
XOR_BLOCK(y, pad, z); \
XOR_BLOCK(x, x, z); \
})
//Rho inverse as defined in the Romulus specification: z = G(x) ^ y, then x ^= z.
//use pad as a tmp variable in case y = z
#define RHO_INV(x, y, z) ({ \
G(pad, x); \
XOR_BLOCK(z, pad, y); \
XOR_BLOCK(x, x, z); \
})
#endif // ROMULUSN1_H_
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 0.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_0(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(ROR(w,24) & 0x0c0c0c0c, 30);
        w ^= ROR(ROR(w,16) & 0xc0c0c0c0, 4);
        w ^= ROR(ROR(w,8) & 0x0c0c0c0c, 2);
        state[row] = w;
    }
}
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 1.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_1(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(ROR(w,16) & 0x30303030, 30);
        w ^= ROR(w & 0x03030303, 28);
        w ^= ROR(ROR(w,16) & 0x30303030, 2);
        state[row] = w;
    }
}
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 2.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_2(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(ROR(w,8) & 0xc0c0c0c0, 6);
        w ^= ROR(ROR(w,16) & 0x0c0c0c0c, 28);
        w ^= ROR(ROR(w,24) & 0xc0c0c0c0, 2);
        state[row] = w;
    }
}
/******************************************************************************
 * MixColumns for rounds i such that (i % 4) == 3.
 * Each 32-bit word is updated independently: three mask-rotate-XOR steps
 * implement the matrix multiplication in the fixsliced representation.
 ******************************************************************************/
void mixcolumns_3(u32* state) {
    for(int row = 0; row < 4; row++) {
        u32 w = state[row];
        w ^= ROR(w & 0x03030303, 30);
        w ^= ROR(w & 0x30303030, 4);
        w ^= ROR(w & 0x03030303, 26);
        state[row] = w;
    }
}
/******************************************************************************
 * Encryption of a single block without any operation mode using SKINNY-128-384.
 * RTK1 and RTK2_3 are given separately to take advantage of the fact that
 * TK2 and TK3 remain the same through the entire data encryption/decryption.
 * 56 rounds are executed as 14 quadruple rounds; rtk1 only stores 16 rounds
 * of material (the TK1 schedule has period 16) and is reused cyclically,
 * while rtk2_3 holds all 56 round tweakeys.
 ******************************************************************************/
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3) {
    u32 tmp;        // scratch required by SWAPMOVE inside QUADRUPLE_ROUND
    u32 state[4];   // 128-bit state
    packing(state, ptext);  // from byte to bitsliced representation
    for(int r = 0; r < 14; r++)
        QUADRUPLE_ROUND(state, rtk1 + 16*(r & 3), rtk2_3 + 16*r);
    unpacking(ctext, state);    // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_384_ROUNDS 56
//Four consecutive SKINNY-128-384 rounds in the fixsliced representation.
//Each round is: bitsliced 8-bit Sbox (built from NOR/XOR layers and
//SWAPMOVE bit swaps), round-tweakey addition (rtk1 and rtk2_3 are XORed
//separately), then the MixColumns variant matching the round index mod 4.
//ShiftRows is never computed explicitly; it is absorbed into the four
//mixcolumns_* variants, and the state realigns with the classical
//representation every 4 rounds.
//NOTE: uses a GNU statement expression and relies on a 'u32 tmp' variable
//declared in the calling scope (consumed by SWAPMOVE).
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
//Single-block SKINNY-128-384 encryption; rtk1 holds 16 rounds of TK1
//material (reused cyclically), rtk2_3 all 56 rounds of TK2^TK3^rconst.
void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#endif // SKINNY128_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
 * The round constants according to the new (fixsliced/bitsliced)
 * representation: 4 precomputed 32-bit words per round for all 56 rounds
 * (224 words total). They are XORed into the round tweakeys once in
 * precompute_rtk2_3, so the round function never adds constants itself.
 * Words with high Hamming weight (0xff...-style) encode the NOT gates that
 * were folded out of the Sbox layer.
 ******************************************************************************/
u32 rconst_32_bs[224] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff, 0x00000014, 0xffffffef,
0x00000000, 0x00000040, 0x00000100, 0x00000000, 0x10000040, 0xfffffeff,
0x44000000, 0xfffffeff, 0x00000000, 0x00000000, 0x00000000, 0x00100000,
0x04000001, 0xffffffff, 0x00040000, 0xffffffff, 0x00400000, 0x00000000,
0x00000000, 0x00000000, 0x00001000, 0xfebfffff, 0x01004400, 0xffffffff,
0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00010000, 0xffffffff,
0x00000004, 0xffffffbf, 0x00000040, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffebf, 0x44000100, 0xffffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00000001, 0xffffffff, 0x00040000, 0xffafffff,
0x00400000, 0x00000000, 0x00000000, 0x00000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfffffbff, 0x00000000, 0x00000400, 0x00000010, 0x00000000,
0x00010010, 0xffffffff
};
/******************************************************************************
 * Pack the input into the bitsliced representation.
 * Resulting bit layout (bit indices of the original 128-bit block):
 * 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
 * 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
 * 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
 * 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
 * The SWAPMOVE sequence is order-dependent; 'tmp' is consumed by the macro.
 ******************************************************************************/
void packing(u32* out, const u8* in) {
u32 tmp;
// little-endian loads; note rows 1 and 2 are swapped (words 8.. and 4..)
LE_LOAD(out, in);
LE_LOAD(out + 1, in + 8);
LE_LOAD(out + 2, in + 4);
LE_LOAD(out + 3, in + 12);
// intra-word bit shuffle, then inter-word 2-bit group transposition
SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
SWAPMOVE(out[2], out[0], 0x30303030, 2);
SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
SWAPMOVE(out[3], out[0], 0x03030303, 6);
SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
SWAPMOVE(out[3], out[2], 0x03030303, 4);
SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
 * Unpack the input to a byte-wise representation.
 * Exact inverse of packing(): the SWAPMOVE sequence is run in reverse order
 * (SWAPMOVE is an involution), then the words are stored little-endian with
 * rows 1 and 2 swapped back. Note: 'in' is modified in place.
 ******************************************************************************/
void unpacking(u8* out, u32 *in) {
u32 tmp;
SWAPMOVE(in[3], in[1], 0x03030303, 2);
SWAPMOVE(in[3], in[2], 0x03030303, 4);
SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
SWAPMOVE(in[3], in[0], 0x03030303, 6);
SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
SWAPMOVE(in[2], in[0], 0x30303030, 2);
SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
LE_STORE(out, in[0]);
LE_STORE(out + 8, in[1]);
LE_STORE(out + 4, in[2]);
LE_STORE(out + 12, in[3]);
}
/******************************************************************************
 * Bitsliced LFSR for TK2. Slice rotation:
 * 0 4      1 5
 * 1 5 ---> 2 6
 * 2 6      3 7
 * 3 7      4 0
 * The feedback slice is computed first, the remaining slices shift down.
 ******************************************************************************/
void lfsr2_bs(u32* tk) {
    u32 feedback = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    // swap even/odd bit lanes of the feedback word
    feedback = ((feedback & 0xaaaaaaaa) >> 1) | ((feedback << 1) & 0xaaaaaaaa);
    memmove(tk, tk + 1, 3 * sizeof(u32));   // tk[0..2] <- tk[1..3]
    tk[3] = feedback;
}
/******************************************************************************
 * Bitsliced LFSR for TK3. Slice rotation:
 * 0 4      7 3
 * 1 5 ---> 0 4
 * 2 6      1 5
 * 3 7      2 6
 * The feedback slice is computed first, the remaining slices shift up.
 ******************************************************************************/
void lfsr3_bs(u32* tk) {
    u32 feedback = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    // swap even/odd bit lanes of the feedback word
    feedback = ((feedback & 0xaaaaaaaa) >> 1) | ((feedback << 1) & 0xaaaaaaaa);
    memmove(tk + 1, tk, 3 * sizeof(u32));   // tk[1..3] <- tk[0..2]
    tk[0] = feedback;
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, twice (P^2).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_2(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,14) & 0xcc00cc00;
tk[i] |= (tmp & 0x000000ff) << 16;
tk[i] |= (tmp & 0xcc000000)>> 2;
tk[i] |= (tmp & 0x0033cc00) >> 8;
tk[i] |= (tmp & 0x00cc0000) >>18;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 4 times (P^4).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_4(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,22) & 0xcc0000cc;
tk[i] |= ROR(tmp,16) & 0x3300cc00;
tk[i] |= ROR(tmp, 24) & 0x00cc3300;
tk[i] |= (tmp & 0x00cc00cc) >> 2;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 6 times (P^6).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_6(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,6) & 0xcccc0000;
tk[i] |= ROR(tmp,24) & 0x330000cc;
tk[i] |= ROR(tmp,10) & 0x3333;
tk[i] |= (tmp & 0xcc) << 14;
tk[i] |= (tmp & 0x3300) << 2;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 8 times (P^8).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_8(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0xcc000033;
tk[i] |= ROR(tmp,8) & 0x33cc0000;
tk[i] |= ROR(tmp,26) & 0x00333300;
tk[i] |= (tmp & 0x00333300) >> 6;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 10 times (P^10).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_10(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc330000;
tk[i] |= ROR(tmp,26) & 0x33000033;
tk[i] |= ROR(tmp,22) & 0x00cccc00;
tk[i] |= (tmp & 0x00330000) >> 14;
tk[i] |= (tmp & 0xcc00) >> 2;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 12 times (P^12).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_12(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc33;
tk[i] |= ROR(tmp,30) & 0x00cc00cc;
tk[i] |= ROR(tmp,10) & 0x33330000;
tk[i] |= ROR(tmp,16) & 0xcc003300;
}
}
/******************************************************************************
 * Apply the tweakey permutation P in a bitsliced manner, 14 times (P^14).
 * Pure bit relocation per 32-bit word; the mask/shift constants are exact
 * and must not be altered.
 ******************************************************************************/
void permute_tk_14(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0x0033cc00;
tk[i] |= ROR(tmp,14) & 0x00cc0000;
tk[i] |= ROR(tmp,30) & 0xcc000000;
tk[i] |= ROR(tmp,16) & 0x000000ff;
tk[i] |= ROR(tmp,18) & 0x33003300;
}
}
/******************************************************************************
 * Precompute all LFSR iterations on TK2 into 'tk'.
 * The LFSR advances every 2 rounds, so the loop strides by 2 and writes the
 * 16-byte result at word offset i*4+4 (i.e. offsets 4, 12, 20, ...). The
 * 4-word slots in between (8, 16, 24, ...) are deliberately left untouched;
 * precompute_rtk2_3 relies on slot 8 remaining all-zero.
 ******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
u32 tk2[4];
packing(tk2, key);
memcpy(tk, tk2, 16);
for(int i = 0 ; i < rounds; i+=2) {
lfsr2_bs(tk2);
memcpy(tk+i*4+4, tk2, 16);
}
}
/******************************************************************************
 * Precompute all LFSR iterations on TK3, XORing them on top of the TK2
 * values written by precompute_lfsr_tk2 (same word offsets: 0, then
 * i*4+4 for even i). 'tk' therefore accumulates LFSR2(TK2) ^ LFSR3(TK3).
 ******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
u32 tk3[4];
packing(tk3, key);
tk[0] ^= tk3[0];
tk[1] ^= tk3[1];
tk[2] ^= tk3[2];
tk[3] ^= tk3[3];
for(int i = 0 ; i < rounds; i+=2) {
lfsr3_bs(tk3);
tk[i*4+4] ^= tk3[0];
tk[i*4+5] ^= tk3[1];
tk[i*4+6] ^= tk3[2];
tk[i*4+7] ^= tk3[3];
}
}
/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 * Processes 8 rounds per iteration: for each pair of rounds it XORs in the
 * packed TK1, applies the appropriate power of the tweakey permutation P
 * (P^2/P^4/P^6/P^8 on even 16-round halves, P^10/P^12/P^14 on odd ones),
 * and extracts only the tweakey rows actually consumed by the fixsliced
 * round function (hence the 0xf0f0f0f0 / 0xc3c3c3c3 / ... row masks).
 * The mask and rotation constants are exact and order-critical.
 ******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test;   // 1 while inside the first half of a 16-round period (P^0..P^8)
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0];
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
// rounds i, i+1: P^0 (tmp already holds the XORed value)
tk[i*4] = tmp[2] & 0xf0f0f0f0;
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
// rounds i+2, i+3
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_2(tmp); // applies P^2
else
permute_tk_10(tmp); // applies P^10
tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
// rounds i+4, i+5
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_4(tmp); // applies P^4
else
permute_tk_12(tmp); // applies P^12
for(int j = 0; j < 4; j++) {
tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
}
tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
// rounds i+6, i+7
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_6(tmp); // applies P^6
else
permute_tk_14(tmp); // applies P^14
tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
// rounds i+8, i+9 (P^8 only in the first half; second half is P^16 = P^0)
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_8(tmp); // applies P^8
for(int j = 0; j < 4; j++) {
tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
}
if (test && (i+8 < rounds)) { //only if next loop iteration
tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
}
}
}
/******************************************************************************
 * Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
 ******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
// rtk+8 points at a 16-byte slot the two LFSR precomputations never write
// (they fill offsets 0, 4, 12, 20, ...), so after the memset it still holds
// zeros; packing an all-zero "TK1" makes permute_tk rearrange rtk in place
// without XORing in any extra key material.
permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);
for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
for(int j = 0; j < 4; j++)
rtk[i*4+j] ^= rconst_32_bs[i*4+j];
}
}
/******************************************************************************
 * Precompute RTK1: the TK1 round-tweakey material for 16 rounds (the TK1
 * schedule has period 16, so callers reuse this array cyclically). TK1 has
 * no LFSR, hence only the permutation/rearrangement step is needed.
 ******************************************************************************/
void precompute_rtk1(u32* rtk1, const u8* tk1) {
memset(rtk1, 0x00, 16*16);
permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
// Helpers for the fixsliced SKINNY-128-384 tweakey schedule.
typedef unsigned char u8;
typedef unsigned int u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
// 32-bit right rotation (y must be in 1..31: a 32-bit shift would be UB)
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
// x ^= y for 4-word (128-bit) blocks; GNU statement expression
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
// Swap the bits of 'a' and 'b' selected by 'mask' at distance 'n'
// (classic bitslicing primitive). Relies on a 'u32 tmp' in the caller scope.
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n); \
})
// Little-endian load of 4 bytes from y into *x.
// Wrapped in do/while(0) so the macro behaves as a single statement.
#define LE_LOAD(x, y) do { \
*(x) = (((u32)(y)[3] << 24) | \
((u32)(y)[2] << 16) | \
((u32)(y)[1] << 8) | \
(y)[0]); \
} while(0)
// Little-endian store of y into 4 bytes at x.
// do/while(0) is required: the body is 4 statements, and without the wrapper
// 'if (c) LE_STORE(...);' would only guard the first byte (CERT PRE10-C).
#define LE_STORE(x, y) do { \
(x)[0] = (y) & 0xff; \
(x)[1] = ((y) >> 8) & 0xff; \
(x)[2] = ((y) >> 16) & 0xff; \
(x)[3] = (y) >> 24; \
} while(0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
// NIST LWC / SUPERCOP API parameters: 128-bit key, nonce and tag,
// no secret message number, and no overlap between input/output buffers.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
// AEAD encryption: writes mlen + CRYPTO_ABYTES bytes to c, sets *clen.
// Returns 0 on success.
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
// AEAD decryption with tag verification: writes clen - CRYPTO_ABYTES bytes
// to m, sets *mlen. Returns 0 on success, nonzero on authentication failure.
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16 bytes array).
 ******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    const u8 *stop = y + BLOCKBYTES;
    while (y != stop)
        *x++ ^= *y++;
}
/******************************************************************************
 * Encryption and authentication using SKINNY-AEAD-M1.
 *
 * c     output: ciphertext followed by the TAGBYTES-byte tag
 * clen  output: set to mlen + TAGBYTES
 * m     input plaintext (mlen bytes)
 * ad    associated data (adlen bytes), authenticated but not encrypted
 * nsec  unused (CRYPTO_NSECBYTES == 0)
 * npub  public nonce (loaded into TK2 by the tweakey schedule)
 * k     key (loaded into TK3 by the tweakey schedule)
 *
 * Returns 0.
 ******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k) {
    u64 i,lfsr = 1;                         // block-counter LFSR, fed into TK1
    u32 rtk1[4*16];                         // round tweakeys derived from TK1
    u32 rtk2_3[4*SKINNY128_384_ROUNDS];     // precomputed LFSR2(TK2)^LFSR3(TK3)
    u8 tmp[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;                             // no secret message number
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384(c, rtk2_3, m, rtk1);
        xor_block(sum, m); // sum for tag computation
        mlen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update lfsr for next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation (full final block)
    if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        for(i = 0; i < mlen; i++)
            sum[i] ^= m[i]; // sum for tag computation
        sum[i] ^= 0x80; // 10* padding of the checksum
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block (stream-cipher style)
        c += mlen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation (partial final block)
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    tkschedule_perm_tk1(rtk1, tmp);
    skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
    memcpy(c, sum, TAGBYTES); // c now points right past the ciphertext
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // restart the block counter for the AD
    SET_DOMAIN(tmp, 0x02);
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(sum, rtk2_3, ad, rtk1); // use 'sum' as tmp array
        xor_block(auth, sum);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) {
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        tkschedule_perm_tk1(rtk1, tmp); // rtk1 fixed before tmp is reused below
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] = 0x80; // 10* padding
        skinny128_384(tmp, rtk2_3, tmp, rtk1);
        xor_block(auth, tmp);
    }
    xor_block(c, auth); // fold the AD authenticator into the stored tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
 * Decryption and verification using SKINNY-AEAD-M1.
 *
 * m     output plaintext (clen - TAGBYTES bytes)
 * mlen  output: set to clen - TAGBYTES
 * nsec  unused (CRYPTO_NSECBYTES == 0)
 * c     input: ciphertext followed by the TAGBYTES-byte tag (clen bytes)
 * ad    associated data (adlen bytes)
 * npub  public nonce
 * k     key
 *
 * Returns 0 on success, -1 if clen is too short or if the tag is invalid.
 * Tag comparison is constant-time.
 ******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k) {
    u64 i,lfsr = 1;                         // block-counter LFSR, fed into TK1
    u8 feedback;                            // accumulates tag-comparison differences
    u32 rtk1[4*16];                         // round tweakeys derived from TK1
    u32 rtk2_3[4*SKINNY128_384_ROUNDS];     // precomputed LFSR2(TK2)^LFSR3(TK3)
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;                             // no secret message number
    if (clen < TAGBYTES)                    // need at least room for the tag
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_inv(m, rtk2_3, c, rtk1);
        xor_block(sum, m); // sum for tag computation
        clen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update LFSR for the next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation (full final block)
    if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypt the padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // 10* padding of the checksum
        c += clen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation (partial final block)
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    tkschedule_perm_tk1(rtk1, tmp);
    skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
    // ----------------- Process the ciphertext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // restart the block counter for the AD
    SET_DOMAIN(tmp, 0x02);
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp);
        skinny128_384(tmp + BLOCKBYTES, rtk2_3, ad, rtk1); // 2nd half of tmp as scratch
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) {
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        tkschedule_perm_tk1(rtk1, tmp); // rtk1 fixed before tmp is reused below
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] ^= 0x80; // 10* padding
        skinny128_384(tmp, rtk2_3, tmp, rtk1);
        xor_block(auth, tmp);
    }
    xor_block(sum, auth); // fold the AD authenticator into the tag
    // ----------------- Process the associated data -----------------
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag comparison
    return feedback ? -1 : 0; // NIST LWC API: -1 on authentication failure
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8; // byte type
typedef unsigned int u32; // 32-bit word
#define SKINNY128_384_ROUNDS 40 // number of rounds of SKINNY-128-384
// Encrypt one 128-bit block with precomputed round tweakeys (assembly).
extern void skinny128_384(u8* ctext, const u32* rtk2_3, const u8* ptext, const u32* rtk1);
// Decrypt one 128-bit block with precomputed round tweakeys (assembly).
extern void skinny128_384_inv(u8* ptext, const u32* rtk2_3, const u8* ctext, const u32* rtk1);
// Precompute LFSR2(TK2) ^ LFSR3(TK3) for 'rounds' rounds (assembly).
extern void tkschedule_lfsr(u32* rtk2_3, const u8* tk2, const u8* tk3, const int rounds);
// Apply the tweakey permutation and add round constants to all round tweakeys (assembly).
extern void tkschedule_perm(u32* rtk2_3);
// Compute the round tweakeys derived from TK1 (assembly).
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
 * applies P^2 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 * The same mask/rotate sequence is applied to each of r6, r7, r8, r9.
 *******************************************************************************/
.align 2
p2:
movw r1, #0xcc00
movt r1, #0xcc00 //r1 <- 0xcc00cc00
movw r10, #0xcc00
movt r10, #0x0033 //r10<- 0xcc000033
and r11, r1, r6, ror #14 // --- permute r6 twice
bfi r11, r6, #16, #8
and r12, r6, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r6
orr r11, r11, r12, lsr #8
and r12, r6, #0x00cc0000
orr r6, r11, r12, lsr #18 // permute r6 twice ---
and r11, r1, r7, ror #14 // --- permute r7 twice
bfi r11, r7, #16, #8
and r12, r7, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r7
orr r11, r11, r12, lsr #8
and r12, r7, #0x00cc0000
orr r7, r11, r12, lsr #18 // permute r7 twice ---
and r11, r1, r8, ror #14 // --- permute r8 twice
bfi r11, r8, #16, #8
and r12, r8, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r8
orr r11, r11, r12, lsr #8
and r12, r8, #0x00cc0000
orr r8, r11, r12, lsr #18 // permute r8 twice ---
and r11, r1, r9, ror #14 // --- permute r9 twice
bfi r11, r9, #16, #8
and r12, r9, #0xcc000000
orr r11, r11, r12, lsr #2
and r12, r10, r9
orr r11, r11, r12, lsr #8
and r12, r9, #0x00cc0000
orr r9, r11, r12, lsr #18 // permute r9 twice ---
bx lr
/*******************************************************************************
 * applies P^4 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 * r14 is spilled to [sp] and restored (caller reserved that slot).
 *******************************************************************************/
.align 2
p4:
str.w r14, [sp] //store r14 on the stack
movw r14, #0x00cc
movt r14, #0xcc00 //r14<- 0xcc0000cc
movw r12, #0xcc00
movt r12, #0x3300 //r12<- 0x3300cc00
movw r11, #0x00cc
movt r11, #0x00cc //r11<- 0x00cc00cc
and r10, r14, r6, ror #22 // --- permute r6 4 times
and r1, r12, r6, ror #16
orr r10, r10, r1
and r1, r6, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r6, r6, r1
orr r6, r10, r6, ror #24 // permute r6 4 times ---
and r10, r14, r7, ror #22 // --- permute r7 4 times
and r1, r12, r7, ror #16
orr r10, r10, r1
and r1, r7, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r7, r7, r1
orr r7, r10, r7, ror #24 // permute r7 4 times ---
and r10, r14, r8, ror #22 // --- permute r8 4 times
and r1, r12, r8, ror #16
orr r10, r10, r1
and r1, r8, r11
orr r10, r10, r1, lsr #2
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r8, r8, r1
orr r8, r10, r8, ror #24 // permute r8 4 times ---
and r10, r14, r9, ror #22 // --- permute r9 4 times (r12 reused as temp below)
ldr.w r14, [sp] //restore r14
and r12, r12, r9, ror #16
orr r10, r10, r12
and r12, r9, r11
orr r10, r10, r12, lsr #2
movw r12, #0xcc33 //r12<- 0x0000cc33
and r9, r9, r12
orr r9, r10, r9, ror #24 // permute r9 4 times ---
bx lr
/*******************************************************************************
 * applies P^6 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p6:
movw r1, #0x3333 //r1 <- 0x00003333
movw r12, #0x00cc
movt r12, #0x3300 //r12<- 0x330000cc
and r10, r6, r1, ror #8 // --- permute r6 6 times
and r11, r12, r6, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r6, ror #10
orr r11, r11, r10
and r10, r6, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r6, #0x00003300
orr r6, r11, r10, lsl #2 // permute r6 6 times ---
and r10, r7, r1, ror #8 // --- permute r7 6 times
and r11, r12, r7, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r7, ror #10
orr r11, r11, r10
and r10, r7, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r7, #0x00003300
orr r7, r11, r10, lsl #2 // permute r7 6 times ---
and r10, r8, r1, ror #8 // --- permute r8 6 times
and r11, r12, r8, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r8, ror #10
orr r11, r11, r10
and r10, r8, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r8, #0x00003300
orr r8, r11, r10, lsl #2 // permute r8 6 times ---
and r10, r9, r1, ror #8 // --- permute r9 6 times
and r11, r12, r9, ror #24
orr r11, r11, r10, ror #6
and r10, r1, r9, ror #10
orr r11, r11, r10
and r10, r9, #0x000000cc
orr r11, r11, r10, lsl #14
and r10, r9, #0x00003300
orr r9, r11, r10, lsl #2 // permute r9 6 times ---
bx lr
/*******************************************************************************
 * applies P^8 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p8:
movw r12, #0x3333 //r12<- 0x00003333
movw r1, #0x0000
movt r1, #0x33cc //r1 <- 0x33cc0000
and r10, r6, r1 // --- permute r6 8 times
and r11, r1, r6, ror #8
orr r11, r11, r10, ror #24
and r10, r6, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r6, r12, lsl #8
orr r6, r11, r10, lsr #6 // permute r6 8 times ---
and r10, r7, r1 // --- permute r7 8 times
and r11, r1, r7, ror #8
orr r11, r11, r10, ror #24
and r10, r7, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r7, r12, lsl #8
orr r7, r11, r10, lsr #6 // permute r7 8 times ---
and r10, r8, r1 // --- permute r8 8 times
and r11, r1, r8, ror #8
orr r11, r11, r10, ror #24
and r10, r8, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r8, r12, lsl #8
orr r8, r11, r10, lsr #6 // permute r8 8 times ---
and r10, r9, r1 // --- permute r9 8 times
and r11, r1, r9, ror #8
orr r11, r11, r10, ror #24
and r10, r9, r12, lsl #2
orr r11, r11, r10, ror #26
and r10, r9, r12, lsl #8
orr r9, r11, r10, lsr #6 // permute r9 8 times ---
bx lr
/*******************************************************************************
 * applies P^10 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p10:
movw r12, #0x0033
movt r12, #0x3300 //r12<- 0x33000033
movw r1, #0xcc33 //r1 <- 0x0000cc33
and r10, r6, r1, ror #8 // --- permute r6 10 times
and r11, r12, r6, ror #26
orr r11, r11, r10, ror #8
and r10, r6, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r6, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r6, #0x0000cc00
orr r6, r11, r10, lsr #2 // permute r6 10 times ---
and r10, r7, r1, ror #8 // --- permute r7 10 times
and r11, r12, r7, ror #26
orr r11, r11, r10, ror #8
and r10, r7, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r7, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r7, #0x0000cc00
orr r7, r11, r10, lsr #2 // permute r7 10 times ---
and r10, r8, r1, ror #8 // --- permute r8 10 times
and r11, r12, r8, ror #26
orr r11, r11, r10, ror #8
and r10, r8, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r8, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r8, #0x0000cc00
orr r8, r11, r10, lsr #2 // permute r8 10 times ---
and r10, r9, r1, ror #8 // --- permute r9 10 times
and r11, r12, r9, ror #26
orr r11, r11, r10, ror #8
and r10, r9, r12, ror #24
orr r11, r11, r10, ror #22
and r10, r9, #0x00330000
orr r11, r11, r10, lsr #14
and r10, r9, #0x0000cc00
orr r9, r11, r10, lsr #2 // permute r9 10 times ---
bx lr
/*******************************************************************************
 * applies P^12 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 * r14 is spilled to [sp] and restored (caller reserved that slot).
 *******************************************************************************/
.align 2
p12:
str.w r14, [sp] //store r14 on the stack
movw r14, #0xcc33 //r14<- 0x0000cc33
movw r12, #0x00cc
movt r12, #0x00cc //r12<- 0x00cc00cc
movw r1, #0x3300
movt r1, #0xcc00 //r1 <- 0xcc003300
and r10, r14, r6, ror #8 // --- permute r6 12 times
and r11, r12, r6, ror #30
orr r11, r11, r10
and r10, r1, r6, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r6, r10, ror #8
orr r6, r11, r10, ror #10 // permute r6 12 times ---
and r10, r14, r7, ror #8 // --- permute r7 12 times
and r11, r12, r7, ror #30
orr r11, r11, r10
and r10, r1, r7, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r7, r10, ror #8
orr r7, r11, r10, ror #10 // permute r7 12 times ---
and r10, r14, r8, ror #8 // --- permute r8 12 times
and r11, r12, r8, ror #30
orr r11, r11, r10
and r10, r1, r8, ror #16
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r8, r10, ror #8
orr r8, r11, r10, ror #10 // permute r8 12 times ---
and r10, r14, r9, ror #8 // --- permute r9 12 times
and r11, r12, r9, ror #30
orr r11, r11, r10
and r10, r1, r9, ror #16
ldr.w r14, [sp] //restore r14
orr r11, r11, r10
movw r10, #0xcccc //r10<- 0x0000cccc
and r10, r9, r10, ror #8
orr r9, r11, r10, ror #10 // permute r9 12 times ---
bx lr
/*******************************************************************************
 * applies P^14 on the tweakey state in a bitsliced manner
 * In/out: tweakey state in r6-r9. Clobbers r1, r10, r11, r12.
 *******************************************************************************/
.align 2
p14:
movw r1, #0xcc00
movt r1, #0x0033 //r1 <- 0x0033cc00
movw r12, #0xcc00
movt r12, #0xcc00 //r12<- 0xcc00cc00
and r10, r1, r6, ror #24 // --- permute r6 14 times
and r11, r6, #0x00000033
orr r11, r10, r11, ror #14
and r10, r6, #0x33000000
orr r11, r11, r10, ror #30
and r10, r6, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r6, r12
orr r6, r11, r10, ror #18 // permute r6 14 times ---
and r10, r1, r7, ror #24 // --- permute r7 14 times
and r11, r7, #0x00000033
orr r11, r10, r11, ror #14
and r10, r7, #0x33000000
orr r11, r11, r10, ror #30
and r10, r7, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r7, r12
orr r7, r11, r10, ror #18 // permute r7 14 times ---
and r10, r1, r8, ror #24 // --- permute r8 14 times
and r11, r8, #0x00000033
orr r11, r10, r11, ror #14
and r10, r8, #0x33000000
orr r11, r11, r10, ror #30
and r10, r8, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r8, r12
orr r8, r11, r10, ror #18 // permute r8 14 times ---
and r10, r1, r9, ror #24 // --- permute r9 14 times
and r11, r9, #0x00000033
orr r11, r10, r11, ror #14
and r10, r9, #0x33000000
orr r11, r11, r10, ror #30
and r10, r9, #0x00ff0000
orr r11, r11, r10, ror #16
and r10, r9, r12
orr r9, r11, r10, ror #18 // permute r9 14 times ---
bx lr
.align 2
/*******************************************************************************
 * packing: bitslice the 128-bit block held in r2-r5 via SWAPMOVE operations.
 * Preconditions (set by the caller, see tkschedule_lfsr):
 *   r10 = 0x0a0a0a0a, r11 = 0x30303030. Clobbers r12.
 *******************************************************************************/
packing:
eor r12, r2, r2, lsr #3
and r12, r12, r10
eor r2, r2, r12
eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
eor r12, r3, r3, lsr #3
and r12, r12, r10
eor r3, r3, r12
eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r12, r4, r4, lsr #3
and r12, r12, r10
eor r4, r4, r12
eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r12, r5, r5, lsr #3
and r12, r12, r10
eor r5, r5, r12
eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r12, r2, r4, lsr #2
and r12, r12, r11
eor r2, r2, r12
eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r12, r2, r3, lsr #4
and r12, r12, r11, lsr #2
eor r2, r2, r12
eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r12, r2, r5, lsr #6
and r12, r12, r11, lsr #4
eor r2, r2, r12
eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r12, r4, r3, lsr #2
and r12, r12, r11, lsr #2
eor r4, r4, r12
eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r12, r4, r5, lsr #4
and r12, r12, r11, lsr #4
eor r4, r4, r12
eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r12, r3, r5, lsr #2
and r12, r12, r11, lsr #4
eor r3, r3, r12
eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
bx lr
.align 2
/*******************************************************************************
 * unpacking: inverse of 'packing' — undo the bitslicing of the 128-bit block
 * held in r2-r5 (same SWAPMOVE steps in reverse order).
 * Builds the 0x0a0a0a0a mask in r6 itself; uses r7 as the 0x30303030 mask —
 * NOTE(review): r7 is assumed to be preset by the caller (callers of
 * 'unpacking' are outside this chunk — confirm). Clobbers r6, r10.
 *******************************************************************************/
unpacking:
movw r6, #0x0a0a
movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
eor r10, r3, r5, lsr #2
and r10, r10, r7, lsr #4
eor r3, r3, r10
eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
eor r10, r4, r5, lsr #4
and r10, r10, r7, lsr #4
eor r4, r4, r10
eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
eor r10, r4, r3, lsr #2
and r10, r10, r7, lsr #2
eor r4, r4, r10
eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
eor r10, r2, r5, lsr #6
and r10, r10, r7, lsr #4
eor r2, r2, r10
eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
eor r10, r2, r3, lsr #4
and r10, r10, r7, lsr #2
eor r2, r2, r10
eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
eor r10, r2, r4, lsr #2
and r10, r10, r7
eor r2, r2, r10
eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
eor r10, r5, r5, lsr #3
and r10, r10, r6
eor r5, r5, r10
eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
eor r10, r4, r4, lsr #3
and r10, r10, r6
eor r4, r4, r10
eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
eor r10, r3, r3, lsr #3
and r10, r10, r6
eor r3, r3, r10
eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
eor r10, r2, r2, lsr #3
and r10, r10, r6
eor r2, r2, r10
eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
bx lr
/******************************************************************************
 * Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
 * Performing both at the same time allows to save some memory accesses.
 * r0 = output array, r1 = TK2 (16 bytes), r2 = TK3 (16 bytes), r3 = rounds.
 * The main loop processes 8 rounds per iteration (4 double-round LFSR steps).
 ******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
push {r0-r12, r14}
ldr.w r3, [r1, #8] //load tk2 (3rd word)
ldr.w r4, [r1, #4] //load tk2 (2nd word)
ldr.w r5, [r1, #12] //load tk2 (4th word)
ldr.w r12, [r1] //load tk2 (1st word)
mov r1, r2 //move tk3 address in r1
mov r2, r12 //move 1st tk2 word in r2
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030
bl packing //pack tk2
mov r6, r2 //move tk2 from r2-r5 to r6-r9
mov r7, r3 //move tk2 from r2-r5 to r6-r9
mov r8, r4 //move tk2 from r2-r5 to r6-r9
mov r9, r5 //move tk2 from r2-r5 to r6-r9
ldr.w r3, [r1, #8] //load tk3 (3rd word)
ldr.w r4, [r1, #4] //load tk3 (2nd word)
ldr.w r5, [r1, #12] //load tk3 (4th word)
ldr.w r2, [r1] //load tk3 (1st word)
bl packing //pack tk3
eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa (LFSR mask)
ldr.w r1, [sp, #12] //load loop counter (rounds, stacked r3) in r1
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #8 //store in tk
loop: //each iteration covers 8 rounds
and r12, r8, r10 // --- apply LFSR2 to tk2
eor r12, r12, r6
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r3, r10 // --- apply LFSR3 to tk3
eor r12, r5, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r5, r7 //tk2 ^ tk3 (1st word)
eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
eor r12, r4, r6 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
and r12, r9, r10 // --- apply LFSR2 to tk2
eor r12, r12, r7
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r2, r10 // --- apply LFSR3 to tk3
eor r12, r4, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r4, r8 //tk2 ^ tk3 (1st word)
eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
eor r12, r3, r7 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
and r12, r6, r10 // --- apply LFSR2 to tk2
eor r12, r12, r8
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r5, r10 // --- apply LFSR3 to tk3
eor r12, r3, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r3, r9 //tk2 ^ tk3 (1st word)
eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
eor r12, r2, r8 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
and r12, r7, r10 // --- apply LFSR2 to tk2
eor r12, r12, r9
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
and r12, r4, r10 // --- apply LFSR3 to tk3
eor r12, r2, r12, lsr #1
and r14, r10, r12, lsl #1
and r12, r12, r10
orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
eor r11, r2, r6 //tk2 ^ tk3 (1st word)
eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
strd r11, r12, [r0], #8 //store in tk
eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
eor r12, r5, r9 //tk2 ^ tk3 (4th word)
strd r11, r12, [r0], #24 //store in tk (skip the rtk1 slots)
subs.w r1, r1, #8 //decrease loop counter by 8
bne loop
pop {r0-r12, r14}
bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add const
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 39th round
strd r8, r9, [r0] //store 2nd half tk for 39th round
add.w sp, #4 //restore stack pointer
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16 = Id,
* we don't need any further calculations, as no LFSR is applied to TK1.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
@ Precomputes the round tweakeys derived from TK1 only: packs the 16-byte key
@ into bitsliced form, then for each pair of rounds applies the tweakey
@ permutation P twice (bl p2) and stores the masked/rotated words in the
@ fixsliced representation expected by the round function. Only 16 rounds of
@ material are produced since P^16 = Id and TK1 has no LFSR.
@ r0 = output tk array (advanced as we store), r1 = input key.
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (mask used by 'packing')
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (mask used by 'packing')
bl packing //pack tk1 into bitsliced representation (r2-r5)
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0 (kept live for all ror #16/ror #0 rounds)
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
movw r3, #0x3030
movt r3, #0x3030 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #224]
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #228]
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #232]
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0, #236]
bl p2 //apply the permutation twice (P^2 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^4 overall)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation twice more (P^6 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3<- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^8 overall)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation twice more (P^10 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice more (P^12 overall)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation twice more (P^14 overall)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Quadruple round of fixsliced SKINNY-128: four consecutive rounds, each
* consisting of the bitsliced S-box layer (SWAPMOVE-based), the addition of
* the precomputed round tweakeys, and one of the four fixsliced mixcolumns
* variants (the representation rotates, so each of the 4 rounds uses a
* different mixcolumns pattern).
* Register contract (as used below):
*   r0 - pointer to rtk_1 words, post-incremented 16 bytes per round
*   r1 - pointer to rtk_2_3 + rconst words, post-incremented likewise
*   r2-r5 - cipher state (bitsliced)
*   r6 - 0x55555555 SWAPMOVE mask
*   r7 - mixcolumns mask (set by caller; value not visible here - TODO confirm)
*   r8-r11 - clobbered as temporaries
******************************************************************************/
.align 2
quadruple_round:
//--- round 1: S-box layer via SWAPMOVE sequences ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
//--- round 2: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
//--- round 3: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
//--- round 4: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Inverse quadruple round of fixsliced SKINNY-128.
*
* Undoes four rounds in one pass: for each round it applies the matching
* inverse MixColumns variant (3, 2, 1, 0 — the reverse order of the forward
* routine), XORs in the round tweakeys (loaded backwards), then reverses the
* forward round's SWAPMOVE/OR/XOR S-box sequence.
*
* Register contract (established by skinny128_384 / skinny128_384_inv):
*   r0  -> rtk1 words, walked backwards via post-decrement loads
*   r1  -> rtk2_3 (+ round constants) words, walked backwards
*   r2-r5  packed 128-bit state
*   r6  =  0x55555555 (SWAPMOVE mask)
*   r7  =  0x30303030 (MixColumns mask)
*   r8-r11 scratch (clobbered)
******************************************************************************/
.align 2
inv_quadruple_round:
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #22
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #26
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
// round tweakeys are consumed in reverse: high pair first, then low pair
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (forward round's SWAPMOVE/OR-XOR sequence, reversed)
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #26 // --- mixcolumns 2 ---
eor r2, r2, r8
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #10
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #26
eor r5, r5, r8
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rk2_3 + rconst
eor r3, r3, r9 //add rk2_3 + rconst
eor r4, r4, r10 //add rk2_3 + rconst
eor r5, r5, r11 //add rk2_3 + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (variant used on odd-position rounds)
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #30
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (same variant as after mixcolumns 3)
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #6 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #30
eor r2, r2, r8, ror #24
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
ldrd r10, r11, [r1], #-8
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
// undo the S-box layer (same variant as after mixcolumns 2)
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384+.
*
* r0 = ctext out (saved on the stack, restored before the final store),
* r1 = rtk2_3 (precomputed TK2/TK3 round tweakeys + round constants),
* r2 = ptext in, r3 = rtk1 (per-block TK1 round tweakeys).
* 40 rounds = 10 calls to quadruple_round; rtk1 only covers 16 rounds, so
* r0 is rewound by 256 bytes after every 4th call.
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14} //saves r0 (ctext) at the top of the stack
mov.w r0, r3 //r0 now walks the rtk1 array
ldr.w r3, [r2, #8] //load plaintext words into r2-r5 (packed order)
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask used by packing)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030
bl packing
mov r7, r11 //r7 <- 0x30303030 (MixColumns mask)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl quadruple_round
bl quadruple_round
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round
bl quadruple_round
bl unpacking
ldr.w r0, [sp], #4 //recover the saved ctext pointer
strd r2, r4, [r0] //store in the same word order the input was loaded
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
/******************************************************************************
* Decrypt a single block using fixsliced SKINNY-128-384+.
*
* Mirror of skinny128_384: the tweakey pointers are first advanced to the
* END of their arrays, then inv_quadruple_round walks them backwards with
* post-decrement loads. rtk1 only covers 16 rounds, so r0 is advanced by
* 256 bytes again after every 4th call.
******************************************************************************/
@ void skinny128_384_inv(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384_inv
.type skinny128_384_inv,%function
.align 2
skinny128_384_inv:
push {r0-r12, r14} //saves r0 (output ptr) at the top of the stack
mov.w r0, r3 //r0 now walks the rtk1 array
ldr.w r3, [r2, #8] //load ciphertext words into r2-r5 (packed order)
ldr.w r4, [r2, #4]
ldr.w r5, [r2, #12]
ldr.w r2, [r2]
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (mask used by packing)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030
bl packing
mov r7, r11 //r7 <- 0x30303030 (MixColumns mask)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
add.w r0, #120 // points to the right rtk1
add.w r1, #632 // points to the last rtk2_3
bl inv_quadruple_round
bl inv_quadruple_round
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
bl inv_quadruple_round
bl unpacking
ldr.w r0, [sp], #4 //recover the saved output pointer
strd r2, r4, [r0] //store in the same word order the input was loaded
strd r3, r5, [r0, #8]
pop {r1-r12,r14}
bx lr
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
// Shorthand integer types used throughout the AEAD code.
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // authentication tag length in bytes
#define KEYBYTES 16 // key length in bytes
#define BLOCKBYTES 16 // SKINNY-128 block length in bytes
// Write the domain-separation byte into the last byte of a 16-byte block.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Clock the 64-bit block-counter LFSR once: shift left, XOR 0x1B into the
// low byte when the top bit was set.
// NOTE: relies on a variable `feedback` declared in the calling scope.
#define UPDATE_LFSR(lfsr) ({ \
feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Store the 64-bit value x little-endian into ptr[0..7].
#define LE_STR_64(ptr, x) ({ \
(ptr)[0] = (u8)(x); \
(ptr)[1] = (u8)((x) >> 8); \
(ptr)[2] = (u8)((x) >> 16); \
(ptr)[3] = (u8)((x) >> 24); \
(ptr)[4] = (u8)((x) >> 32); \
(ptr)[5] = (u8)((x) >> 40); \
(ptr)[6] = (u8)((x) >> 48); \
(ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
// NOTE(review): accesses the buffers as u32 words, so both pointers are
// assumed 4-byte aligned — confirm at call sites.
#define XOR_BLOCK(x,y) ({ \
((u32*)(x))[0] ^= ((u32*)(y))[0]; \
((u32*)(x))[1] ^= ((u32*)(y))[1]; \
((u32*)(x))[2] ^= ((u32*)(y))[2]; \
((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
// Parameter sizes (in bytes) advertised to the NIST LWC test harness.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
// Encrypts m (mlen bytes) with associated data ad under key k and nonce
// npub; writes ciphertext||tag to c and its length to *clen.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec, const unsigned char *npub,
const unsigned char *k);
//API required by the NIST for the LWC competition
// Decrypts c (clen bytes, ciphertext||tag) and verifies the tag; on success
// writes the plaintext to m and its length to *outputmlen.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub, const unsigned char *k);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // Fold the 16-byte block y into x, byte by byte.
    for (int idx = BLOCKBYTES; idx-- > 0; )
        x[idx] ^= y[idx];
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions.
*
* auth    (out) 16-byte buffer receiving the AD authentication value.
* c, tag  caller buffers: when mlen == 0 the tag has not been computed yet,
*         so the final AD block is processed in parallel with the tag block
*         (tag and c are fed into the paired skinny128_384 call).
* rtk1    scratch for the per-block TK1 round tweakeys.
* rtk2_3  precomputed TK2/TK3 round tweakeys (with round constants).
* mlen    message length; only used to decide whether the tag still needs
*         to be computed alongside the last AD block.
* ad, adlen  associated data and its byte length.
******************************************************************************/
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, u32* rtk1,
        u32* rtk2_3, u64 mlen, const u8* ad, u64 adlen) {
    u64 lfsr = 1;            // 64-bit block counter
    u8 feedback;             // required by the UPDATE_LFSR macro
    u8 tmp[2*BLOCKBYTES];
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02);   // domain for full AD blocks
    while (adlen >= 2*BLOCKBYTES) {   // process 2 full AD blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        tkschedule_perm_tk1(rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > BLOCKBYTES) { // pad and process 2 blocks: one full, one partial
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padded AD
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        memset(tmp, 0x00, BLOCKBYTES);      // 10* padding of the partial block
        memcpy(tmp, ad + BLOCKBYTES, adlen);
        tmp[adlen] ^= 0x80; // padding
        skinny128_384(tmp + BLOCKBYTES, tmp, ad, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
    } else if (adlen == BLOCKBYTES) { // one last full AD block
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) { // tag has *NOT* been computed yet: pair it with the AD block
            tkschedule_perm_tk1(rtk1, tmp, tag);
            skinny128_384(auth, c, ad, c, rtk1, rtk2_3);
        } else { // tag already computed: process the last AD block alone
            tkschedule_perm_tk1(rtk1, tmp, tmp);
            skinny128_384(auth, auth, ad, ad, rtk1, rtk2_3);
        }
    } else if (adlen > 0) { // one last partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padded AD
        memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // 10* padding
        memcpy(tmp + BLOCKBYTES, ad, adlen);
        tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
        if (mlen == 0) { // tag has *NOT* been computed yet: pair it with the AD block
            tkschedule_perm_tk1(rtk1, tmp, tag); // compute the tag
            skinny128_384(auth, c, tmp + BLOCKBYTES, c, rtk1, rtk2_3);
        } else { // tag already computed: process the last AD block alone
            tkschedule_perm_tk1(rtk1, tmp, tmp);
            skinny128_384(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, rtk1, rtk2_3);
        }
    }
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1.
*
* Writes ciphertext||tag to c and mlen + TAGBYTES to *clen; returns 0.
* Message blocks are processed two at a time through the 2-block
* skinny128_384 primitive; the checksum of plaintext blocks (sum) is
* encrypted under a tag-domain tweak to produce the tag, either here or —
* when the last message block pairs up with the last AD block — inside
* skinny_aead_m1_auth.
******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback;            // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1;         // 64-bit block counter
    u32 rtk1[8*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) { // process 2 blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384(c, c + BLOCKBYTES, m, m + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (mlen > BLOCKBYTES) { // one full block + one partial: pad and process in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padded m
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // full block -> c; keystream for the partial block -> auth
        skinny128_384(c, auth, m, auth, rtk1, rtk2_3);
        xor_block(sum, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(sum, m); // sum for tag computation
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // encrypt the last block and the checksum (tag) in one paired call
        skinny128_384(c, sum, m, sum, rtk1, rtk2_3);
        c += BLOCKBYTES;
    } else if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++) // sum for tag computation
            sum[i] ^= m[i];
        sum[i] ^= 0x80; // padding
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // keystream for the partial block and the tag, in one paired call
        skinny128_384(auth, sum, auth, sum, rtk1, rtk2_3);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) { // tag has *NOT* been computed in the branches above
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        // When the AD length leaves no lone final block to pair with, the
        // tag block is encrypted here; otherwise skinny_aead_m1_auth pairs
        // it with the last AD block.
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag);
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, mlen, ad, adlen);
    xor_block(sum, auth);
    memcpy(c, sum, TAGBYTES); // append the tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Decryption and authentication using SKINNY-AEAD-M1.
*
* Mirrors crypto_aead_encrypt: ciphertext blocks are decrypted (or, for the
* padded partial block, the keystream is regenerated with the forward
* cipher), the plaintext checksum is encrypted into the candidate tag, and
* the result is compared against the transmitted tag in constant time.
* Returns 0 on success, -1 on verification failure.
*
* NOTE(review): the plaintext is written to m before the tag is verified —
* callers must discard m when the return value is nonzero.
******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback;            // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1;         // 64-bit block counter
    u32 rtk1[8*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    if (clen < TAGBYTES)
        return -1;          // input too short to even hold a tag
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) { // process 2 blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_inv(m, m + BLOCKBYTES, c, c + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (clen > BLOCKBYTES) { // one full block + one partial
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3); // decrypt the full block
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padded m
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        // regenerate the keystream for the padded partial block
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3);
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        SET_DOMAIN(tag, 0x04); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        // regenerate the keystream for the padded partial block
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += clen;
        clen = 0;
    }
    if (clen == 0) { // tag has *NOT* been computed yet
        LE_STR_64(tag, lfsr);
        // Pair the tag with the last AD block when possible (see encrypt).
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag); //if AD can be processed in parallel
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, clen, ad, adlen);
    xor_block(sum, auth);
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    // Normalize to the NIST LWC API convention: 0 on success, -1 on failure.
    // (Previously the raw accumulated byte was returned, which is nonzero on
    // mismatch but not the specified -1.)
    return feedback ? -1 : 0;
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_384_ROUNDS 40
extern void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const u32* rtk1, const u32* rtk2_3);
extern void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const u32* rtk1, const u32* rtk2_3);
extern void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
extern void pack_tk1(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
extern void tkschedule_lfsr_3(u32* rtk, const u8* tk3, const u8* tk3_bis, const int rounds);
extern void tkschedule_perm(u32* rtk);
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
#endif // SKINNY128_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
// Integer shorthands shared by the AEAD implementation.
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // tag size in bytes
#define KEYBYTES 16 // key size in bytes
#define BLOCKBYTES 16 // cipher block size in bytes
// Set the domain-separation byte (last byte of a 16-byte tweak block).
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// One step of the 64-bit counter LFSR: left shift, folding 0x1B back in
// when the MSB falls out. Requires a `feedback` variable in caller scope.
#define UPDATE_LFSR(lfsr) ({ \
feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Serialize the 64-bit value x into ptr[0..7], least-significant byte first.
#define LE_STR_64(ptr, x) ({ \
(ptr)[0] = (u8)(x); \
(ptr)[1] = (u8)((x) >> 8); \
(ptr)[2] = (u8)((x) >> 16); \
(ptr)[3] = (u8)((x) >> 24); \
(ptr)[4] = (u8)((x) >> 32); \
(ptr)[5] = (u8)((x) >> 40); \
(ptr)[6] = (u8)((x) >> 48); \
(ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
// NOTE(review): word-wise u32 access — assumes 4-byte-aligned buffers.
#define XOR_BLOCK(x,y) ({ \
((u32*)(x))[0] ^= ((u32*)(y))[0]; \
((u32*)(x))[1] ^= ((u32*)(y))[1]; \
((u32*)(x))[2] ^= ((u32*)(y))[2]; \
((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
// Sizes (bytes) reported to the NIST LWC benchmarking/test framework.
#define CRYPTO_KEYBYTES 16
#define CRYPTO_NSECBYTES 0
#define CRYPTO_NPUBBYTES 16
#define CRYPTO_ABYTES 16
#define CRYPTO_NOOVERLAP 1
// NIST LWC AEAD encryption entry point: writes ciphertext||tag to c,
// sets *clen = mlen + CRYPTO_ABYTES.
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
);
// NIST LWC AEAD decryption entry point: verifies the tag and, on success,
// writes the plaintext to m and sets *mlen.
int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // x ^= y over one 16-byte block, walking both buffers with pointers.
    const u8 *stop = y + BLOCKBYTES;
    while (y != stop)
        *x++ ^= *y++;
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1.
*
* Single-block (SKINNY-128-384+) variant: message blocks are encrypted one
* at a time; the plaintext checksum is accumulated directly in the tag slot
* c + mlen and then encrypted under the tag-domain tweak. The AD is hashed
* afterwards and XORed into the tag. Returns 0.
******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u64 i,lfsr = 1;         // 64-bit block counter
    u8 feedback;            // required by the UPDATE_LFSR macro
    u32 rtk1[4*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES); // tag slot doubles as checksum buffer
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_plus_encrypt(c, m, rtk1, rtk2_3);
        xor_block(c + mlen, m); // sum for tag computation
        mlen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update lfsr for next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        for(i = 0; i < mlen; i++)
            c[mlen + i] ^= m[i]; // sum for tag computation
        c[mlen + i] ^= 0x80; // padding bit of the checksum
        precompute_rtk1(rtk1, tmp);
        // encrypt the zero block to get the keystream for the partial block
        skinny128_384_plus_encrypt(auth, auth, rtk1, rtk2_3);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_plus_encrypt(c, c, rtk1, rtk2_3); // encrypt the checksum in place
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own counter sequence
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_plus_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // last AD block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padded AD
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // 10* padding
        memcpy(tmp, ad, adlen);
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_plus_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(c, auth); // fold the AD hash into the tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    // Decryption and tag verification for SKINNY-AEAD-M1 (single-block
    // SKINNY-128-384+ variant). Returns 0 on success, -1 on tag mismatch.
    // NOTE(review): plaintext is written to m before the tag is checked —
    // callers must discard m when the return value is nonzero.
    u64 i,lfsr = 1;         // 64-bit block counter
    u8 feedback;            // required by the UPDATE_LFSR macro
    u32 rtk1[4*16];         // per-block TK1 round tweakeys
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // precomputed TK2/TK3 round tweakeys
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec;             // unused (CRYPTO_NSECBYTES == 0)
    if (clen < TAGBYTES)
        return -1;          // too short to contain a tag
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_plus_decrypt(m, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        clen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update LFSR for the next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        precompute_rtk1(rtk1, tmp);
        // regenerate the keystream used for the padded partial block
        skinny128_384_plus_encrypt(auth, auth, rtk1, rtk2_3);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        c += clen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_plus_encrypt(sum, sum, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the ciphertext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own counter sequence
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_plus_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // last AD block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padded AD
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // 10* padding
        memcpy(tmp, ad, adlen);
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_plus_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(sum, auth); // fold the AD hash into the candidate tag
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    // Normalize to the NIST LWC API convention: 0 on success, -1 on failure.
    // (Previously the raw accumulated byte was returned, which is nonzero on
    // mismatch but not the specified -1.)
    return feedback ? -1 : 0;
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical representation after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0.
* Three masked rotate-and-XOR steps on each 32-bit slice; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void mixcolumns_0(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 24) | (w << 8)) & 0x0c0c0c0c;
        w ^= (t >> 30) | (t << 2);
        t = ((w >> 16) | (w << 16)) & 0xc0c0c0c0;
        w ^= (t >> 4) | (t << 28);
        t = ((w >> 8) | (w << 24)) & 0x0c0c0c0c;
        w ^= (t >> 2) | (t << 30);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1.
* Three masked rotate-and-XOR steps on each 32-bit slice; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void mixcolumns_1(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 30) | (t << 2);
        t = w & 0x03030303;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 2) | (t << 30);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2.
* Three masked rotate-and-XOR steps on each 32-bit slice; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void mixcolumns_2(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 8) | (w << 24)) & 0xc0c0c0c0;
        w ^= (t >> 6) | (t << 26);
        t = ((w >> 16) | (w << 16)) & 0x0c0c0c0c;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 24) | (w << 8)) & 0xc0c0c0c0;
        w ^= (t >> 2) | (t << 30);
        state[i] = w;
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3.
* Three mask-and-XOR steps on each 32-bit slice; the selected bits need no
* pre-rotation in this round alignment, only the rotated XOR back in.
******************************************************************************/
void mixcolumns_3(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = w & 0x03030303;
        w ^= (t >> 30) | (t << 2);
        t = w & 0x30303030;
        w ^= (t >> 4) | (t << 28);
        t = w & 0x03030303;
        w ^= (t >> 26) | (t << 6);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 0.
* Undoes the three steps of mixcolumns_0 in reverse order; rotations are
* written out explicitly instead of via the ROR macro.
******************************************************************************/
void inv_mixcolumns_0(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 8) | (w << 24)) & 0x0c0c0c0c;
        w ^= (t >> 2) | (t << 30);
        t = ((w >> 16) | (w << 16)) & 0xc0c0c0c0;
        w ^= (t >> 4) | (t << 28);
        t = ((w >> 24) | (w << 8)) & 0x0c0c0c0c;
        w ^= (t >> 30) | (t << 2);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 1.
* (The original header said "== 0" — a copy-paste slip.) Undoes the three
* steps of mixcolumns_1 in reverse order.
******************************************************************************/
void inv_mixcolumns_1(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 2) | (t << 30);
        t = w & 0x03030303;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 16) | (w << 16)) & 0x30303030;
        w ^= (t >> 30) | (t << 2);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 2.
* (The original header said "== 0" — a copy-paste slip.) Undoes the three
* steps of mixcolumns_2 in reverse order.
******************************************************************************/
void inv_mixcolumns_2(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = ((w >> 24) | (w << 8)) & 0xc0c0c0c0;
        w ^= (t >> 2) | (t << 30);
        t = ((w >> 16) | (w << 16)) & 0x0c0c0c0c;
        w ^= (t >> 28) | (t << 4);
        t = ((w >> 8) | (w << 24)) & 0xc0c0c0c0;
        w ^= (t >> 6) | (t << 26);
        state[i] = w;
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 3.
* (The original header said "== 0" — a copy-paste slip.) Undoes the three
* steps of mixcolumns_3 in reverse order.
******************************************************************************/
void inv_mixcolumns_3(u32* state) {
    u32 t;
    for(int i = 0; i < 4; i++) {
        u32 w = state[i];
        t = w & 0x03030303;
        w ^= (t >> 26) | (t << 6);
        t = w & 0x30303030;
        w ^= (t >> 4) | (t << 28);
        t = w & 0x03030303;
        w ^= (t >> 30) | (t << 2);
        state[i] = w;
    }
}
/******************************************************************************
* Encryption of a single block without any operation mode using
* SKINNY-128-384+. RTK1 (16 precomputed round tweakeys, reused cyclically)
* and RTK2_3 (all 40 rounds) are given separately to take advantage of the
* fact that TK2 and TK3 remain the same through the entire data
* encryption/decryption.
******************************************************************************/
void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
                const u32* rtk2_3) {
    u32 tmp;                        // scratch word required by the SWAPMOVE macro
    u32 state[4];                   // 128-bit state, bitsliced
    packing(state, ptext);          // from byte to bitsliced representation
    // 10 quadruple rounds = 40 rounds; rtk1 repeats with period 4 groups
    for(int i = 0; i < 10; i++)
        QUADRUPLE_ROUND(state, rtk1 + 16*(i & 3), rtk2_3 + 16*i);
    unpacking(ctext, state);        // from bitsliced to byte representation
}
/******************************************************************************
* Decryption of a single block without any operation mode using
* SKINNY-128-384+. Mirrors skinny128_384_plus_encrypt by running the
* quadruple rounds in reverse order. Note: the first argument receives the
* output and the second is the input, exactly as in the encrypt routine.
******************************************************************************/
void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
                const u32* rtk2_3) {
    u32 tmp;                        // scratch word required by the SWAPMOVE macro
    u32 state[4];                   // 128-bit state, bitsliced
    packing(state, ptext);          // from byte to bitsliced representation
    // same schedule as encryption, walked backwards (i = 9 .. 0)
    for(int i = 9; i >= 0; i--)
        INV_QUADRUPLE_ROUND(state, rtk1 + 16*(i & 3), rtk2_3 + 16*i);
    unpacking(ctext, state);        // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 40
/******************************************************************************
* Four consecutive SKINNY-128-384+ rounds on the bitsliced state.
* Each round is: S-box layer (the NOR/XOR + SWAPMOVE network), addition of
* one round tweakey from RTK1 and one from RTK2_3 (4 words per round), then
* the fixsliced MixColumns variant matching the round index (mixcolumns_0..3).
* NOTE: relies on a variable 'u32 tmp;' declared in the calling scope (it is
* used by the SWAPMOVE macro). Implemented as a GNU statement expression.
******************************************************************************/
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
mixcolumns_0(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
mixcolumns_1(state); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
mixcolumns_2(state); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
mixcolumns_3(state); \
})
/******************************************************************************
* Inverse of QUADRUPLE_ROUND: undoes four SKINNY-128-384+ rounds by running
* each round's steps in reverse order — inverse MixColumns first, then the
* round-tweakey addition (XOR is self-inverse), then the inverse S-box layer.
* NOTE: relies on a variable 'u32 tmp;' declared in the calling scope (it is
* used by the SWAPMOVE macro). Implemented as a GNU statement expression.
******************************************************************************/
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
inv_mixcolumns_3(state); \
state[0] ^= (rtk1)[12]; \
state[1] ^= (rtk1)[13]; \
state[2] ^= (rtk1)[14]; \
state[3] ^= (rtk1)[15]; \
state[0] ^= (rtk2_3)[12]; \
state[1] ^= (rtk2_3)[13]; \
state[2] ^= (rtk2_3)[14]; \
state[3] ^= (rtk2_3)[15]; \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
inv_mixcolumns_2(state); \
state[0] ^= (rtk1)[8]; \
state[1] ^= (rtk1)[9]; \
state[2] ^= (rtk1)[10]; \
state[3] ^= (rtk1)[11]; \
state[0] ^= (rtk2_3)[8]; \
state[1] ^= (rtk2_3)[9]; \
state[2] ^= (rtk2_3)[10]; \
state[3] ^= (rtk2_3)[11]; \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
inv_mixcolumns_1(state); \
state[0] ^= (rtk1)[4]; \
state[1] ^= (rtk1)[5]; \
state[2] ^= (rtk1)[6]; \
state[3] ^= (rtk1)[7]; \
state[0] ^= (rtk2_3)[4]; \
state[1] ^= (rtk2_3)[5]; \
state[2] ^= (rtk2_3)[6]; \
state[3] ^= (rtk2_3)[7]; \
SWAPMOVE(state[1], state[2], 0x55555555, 0); \
state[3] ^= (state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
inv_mixcolumns_0(state); \
state[0] ^= (rtk1)[0]; \
state[1] ^= (rtk1)[1]; \
state[2] ^= (rtk1)[2]; \
state[3] ^= (rtk1)[3]; \
state[0] ^= (rtk2_3)[0]; \
state[1] ^= (rtk2_3)[1]; \
state[2] ^= (rtk2_3)[2]; \
state[3] ^= (rtk2_3)[3]; \
SWAPMOVE(state[3], state[0], 0x55555555, 0); \
state[1] ^= (state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
SWAPMOVE(state[0], state[3], 0x55555555, 1); \
SWAPMOVE(state[1], state[0], 0x55555555, 1); \
state[1] ^= ~(state[2] | state[3]); \
SWAPMOVE(state[3], state[2], 0x55555555, 1); \
SWAPMOVE(state[2], state[1], 0x55555555, 1); \
state[3] ^= ~(state[0] | state[1]); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // 128-bit authentication tag
#define KEYBYTES 16 // 128-bit key
#define BLOCKBYTES 16 // 128-bit cipher block
/* Write the domain-separation byte into byte 15 of a 16-byte tweak block. */
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
/* Clock the 64-bit block-counter LFSR once (feedback constant 0x1B).
* NOTE: expects a variable named 'feedback' to be declared in the calling
* scope — it is used as scratch, not read beforehand. */
#define UPDATE_LFSR(lfsr) ({ \
feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
(lfsr) = ((lfsr) << 1) ^ feedback; \
})
/* Store the 64-bit value x little-endian into the first 8 bytes of ptr. */
#define LE_STR_64(ptr, x) ({ \
(ptr)[0] = (u8)(x); \
(ptr)[1] = (u8)((x) >> 8); \
(ptr)[2] = (u8)((x) >> 16); \
(ptr)[3] = (u8)((x) >> 24); \
(ptr)[4] = (u8)((x) >> 32); \
(ptr)[5] = (u8)((x) >> 40); \
(ptr)[6] = (u8)((x) >> 48); \
(ptr)[7] = (u8)((x) >> 56); \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcmp
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new (bitsliced/fixsliced)
* representation: 40 rounds x 4 32-bit words = 160 entries, consumed as
* rconst_32_bs[round*4 + word] in precompute_rtk2_3.
******************************************************************************/
u32 rconst_32_bs[160] = {
0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
0x00000010, 0x00000000, 0x00010010, 0xfffffbff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
void packing(u32* out, const u8* in) {
    u32 tmp;    // scratch required by SWAPMOVE
    // load 16 bytes as four little-endian words, middle pair swapped
    LE_LOAD(out, in);
    LE_LOAD(out + 1, in + 8);
    LE_LOAD(out + 2, in + 4);
    LE_LOAD(out + 3, in + 12);
    // in-word bit swaps, identical for every word
    for(int i = 0; i < 4; i++)
        SWAPMOVE(out[i], out[i], 0x0a0a0a0a, 3);
    // cross-word bit swaps (order matters)
    SWAPMOVE(out[2], out[0], 0x30303030, 2);
    SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
    SWAPMOVE(out[3], out[0], 0x03030303, 6);
    SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
    SWAPMOVE(out[3], out[2], 0x03030303, 4);
    SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation — exact inverse of packing:
* the cross-word swaps are undone in reverse order, then the in-word swaps,
* then the words are stored back little-endian.
******************************************************************************/
void unpacking(u8* out, u32 *in) {
    u32 tmp;    // scratch required by SWAPMOVE
    // undo the cross-word bit swaps (reverse of packing order)
    SWAPMOVE(in[3], in[1], 0x03030303, 2);
    SWAPMOVE(in[3], in[2], 0x03030303, 4);
    SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
    SWAPMOVE(in[3], in[0], 0x03030303, 6);
    SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
    SWAPMOVE(in[2], in[0], 0x30303030, 2);
    // undo the in-word bit swaps (self-inverse transform)
    for(int i = 0; i < 4; i++)
        SWAPMOVE(in[i], in[i], 0x0a0a0a0a, 3);
    LE_STORE(out, in[0]);
    LE_STORE(out + 8, in[1]);
    LE_STORE(out + 4, in[2]);
    LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* Bitsliced TK2 LFSR:
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
* Shifts the four slices up by one and feeds the new bottom slice.
******************************************************************************/
void lfsr2_bs(u32* tk) {
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    for(int i = 0; i < 3; i++)
        tk[i] = tk[i+1];
    tk[3] = fb;
}
/******************************************************************************
* Bitsliced TK3 LFSR:
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
* Shifts the four slices down by one and feeds the new top slice.
******************************************************************************/
void lfsr3_bs(u32* tk) {
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    for(int i = 3; i > 0; i--)
        tk[i] = tk[i-1];
    tk[0] = fb;
}
/******************************************************************************
* Apply the tweakey permutation P twice (P^2) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_2(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,14) & 0xcc00cc00;
tk[i] |= (tmp & 0x000000ff) << 16;
tk[i] |= (tmp & 0xcc000000)>> 2;
tk[i] |= (tmp & 0x0033cc00) >> 8;
tk[i] |= (tmp & 0x00cc0000) >>18;
}
}
/******************************************************************************
* Apply the tweakey permutation P four times (P^4) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_4(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,22) & 0xcc0000cc;
tk[i] |= ROR(tmp,16) & 0x3300cc00;
tk[i] |= ROR(tmp, 24) & 0x00cc3300;
tk[i] |= (tmp & 0x00cc00cc) >> 2;
}
}
/******************************************************************************
* Apply the tweakey permutation P six times (P^6) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_6(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,6) & 0xcccc0000;
tk[i] |= ROR(tmp,24) & 0x330000cc;
tk[i] |= ROR(tmp,10) & 0x3333;
tk[i] |= (tmp & 0xcc) << 14;
tk[i] |= (tmp & 0x3300) << 2;
}
}
/******************************************************************************
* Apply the tweakey permutation P eight times (P^8) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_8(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0xcc000033;
tk[i] |= ROR(tmp,8) & 0x33cc0000;
tk[i] |= ROR(tmp,26) & 0x00333300;
tk[i] |= (tmp & 0x00333300) >> 6;
}
}
/******************************************************************************
* Apply the tweakey permutation P ten times (P^10) in a bitsliced manner.
* Each rotate/shift term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_10(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc330000;
tk[i] |= ROR(tmp,26) & 0x33000033;
tk[i] |= ROR(tmp,22) & 0x00cccc00;
tk[i] |= (tmp & 0x00330000) >> 14;
tk[i] |= (tmp & 0xcc00) >> 2;
}
}
/******************************************************************************
* Apply the tweakey permutation P twelve times (P^12) in a bitsliced manner.
* Each rotate term is masked to a disjoint group of bits, so the successive
* |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_12(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,8) & 0xcc33;
tk[i] |= ROR(tmp,30) & 0x00cc00cc;
tk[i] |= ROR(tmp,10) & 0x33330000;
tk[i] |= ROR(tmp,16) & 0xcc003300;
}
}
/******************************************************************************
* Apply the tweakey permutation P fourteen times (P^14) in a bitsliced
* manner. Each rotate term is masked to a disjoint group of bits, so the
* successive |= operations assemble the fully permuted word.
******************************************************************************/
void permute_tk_14(u32* tk) {
u32 tmp;
for(int i =0; i < 4; i++) {
tmp = tk[i];
tk[i] = ROR(tmp,24) & 0x0033cc00;
tk[i] |= ROR(tmp,14) & 0x00cc0000;
tk[i] |= ROR(tmp,30) & 0xcc000000;
tk[i] |= ROR(tmp,16) & 0x000000ff;
tk[i] |= ROR(tmp,18) & 0x33003300;
}
}
/******************************************************************************
* Precompute all LFSR states of TK2: the packed key goes into tk[0..3], and
* each subsequent group of 4 words receives the LFSR advanced once (one
* advance covers two rounds, hence the step of 2).
******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
    u32 tk2[4];
    packing(tk2, key);
    for(int j = 0; j < 4; j++)
        tk[j] = tk2[j];
    for(int i = 0; i < rounds; i += 2) {
        lfsr2_bs(tk2);
        for(int j = 0; j < 4; j++)
            tk[i*4 + 4 + j] = tk2[j];
    }
}
/******************************************************************************
* Precompute all LFSR states of TK3, XORing them on top of the TK2 states
* already written by precompute_lfsr_tk2 (same word layout, same step of 2).
******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
    u32 tk3[4];
    packing(tk3, key);
    for(int j = 0; j < 4; j++)
        tk[j] ^= tk3[j];
    for(int i = 0; i < rounds; i += 2) {
        lfsr3_bs(tk3);
        for(int j = 0; j < 4; j++)
            tk[i*4 + 4 + j] ^= tk3[j];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
* Processes 8 rounds per loop iteration; the call sites use rounds = 16 and
* rounds = 40, both multiples of 8 — NOTE(review): the loop body assumes
* this, confirm before calling with other values. Alternates between powers
* P^2..P^8 and P^10..P^16 (== identity shifted) every 8 rounds since P has
* order 16.
******************************************************************************/
void permute_tk(u32* tk, const u8* key, const int rounds) {
u32 test; // 1 when this 8-round group uses P^2/P^4/P^6/P^8, else P^10..P^14
u32 tk1[4], tmp[4];
packing(tk1, key);
memcpy(tmp, tk, 16);
tmp[0] ^= tk1[0]; // fold TK1 into the round-0 tweakey
tmp[1] ^= tk1[1];
tmp[2] ^= tk1[2];
tmp[3] ^= tk1[3];
for(int i = 0 ; i < rounds; i += 8) {
test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
tk[i*4] = tmp[2] & 0xf0f0f0f0; // rounds 8i, 8i+1 (rows rotated 2)
tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+4, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_2(tmp); // applies P^2
else
permute_tk_10(tmp); // applies P^10
tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3; // rounds 8i+2, 8i+3
tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+12, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_4(tmp); // applies P^4
else
permute_tk_12(tmp); // applies P^12
for(int j = 0; j < 4; j++) {
tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
}
tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0; // rounds 8i+4, 8i+5
tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
memcpy(tmp, tk+i*4+20, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_6(tmp); // applies P^6
else
permute_tk_14(tmp); // applies P^14
tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3; // rounds 8i+6, 8i+7
tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
memcpy(tmp, tk+i*4+28, 16);
XOR_BLOCKS(tmp, tk1);
if (test)
permute_tk_8(tmp); // applies P^8 (second half needs no extra permutation)
for(int j = 0; j < 4; j++) {
tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
}
if (test && (i+8 < rounds)) { //only if next loop iteration
tk[i*4+32] = tmp[2] & 0xf0f0f0f0; // seed the next group's first rounds
tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
}
}
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all 40 rounds.
* permute_tk is given rtk+8 as the "TK1 key": the two LFSR precomputations
* only write every other 4-word group, so words 8..11 are still zero from
* the memset — i.e. the permutation is applied with an all-zero TK1.
******************************************************************************/
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
    memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
    permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // all-zero TK1
    // finally add the round constants (4 words per round)
    for(int i = 0; i < 4*SKINNY128_384_ROUNDS; i++)
        rtk[i] ^= rconst_32_bs[i];
}
/******************************************************************************
* Precompute RTK1. Only 16 round tweakeys are stored since TK1 is not
* clocked by an LFSR and the permutation P has order 16: the cipher reuses
* these 16 entries cyclically.
******************************************************************************/
void precompute_rtk1(u32* rtk1, const u8* tk1) {
    const int rtk1_rounds = 16;             // RTK1 repeats with period 16
    memset(rtk1, 0x00, rtk1_rounds * 16);   // 16 rounds x 16 bytes
    permute_tk(rtk1, tk1, rtk1_rounds);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
/* 32-bit right rotation; y must be in 1..31 (y == 0 or 32 would shift by
* the full word width, which is undefined behavior). */
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
/* XOR the 128-bit block y into x (both arrays of four u32 words). */
#define XOR_BLOCKS(x,y) ({ \
(x)[0] ^= (y)[0]; \
(x)[1] ^= (y)[1]; \
(x)[2] ^= (y)[2]; \
(x)[3] ^= (y)[3]; \
})
/* Swap the bits of b and (a >> n) selected by mask.
* NOTE: requires a variable 'u32 tmp;' declared in the calling scope. */
#define SWAPMOVE(a, b, mask, n) ({ \
tmp = (b ^ (a >> n)) & mask; \
b ^= tmp; \
a ^= (tmp << n); \
})
/* Little-endian load of 4 bytes at y into the u32 pointed to by x.
* Wrapped in do/while(0) so the macro is a single statement and stays safe
* inside unbraced if/else bodies (CERT PRE10-C). */
#define LE_LOAD(x, y) do { \
*(x) = (((u32)(y)[3] << 24) | \
((u32)(y)[2] << 16) | \
((u32)(y)[1] << 8) | \
(y)[0]); \
} while(0)
/* Little-endian store of the u32 value y into 4 bytes at x.
* do/while(0) keeps the multi-statement expansion from silently breaking
* when used as the body of an unbraced if/else (CERT PRE10-C). */
#define LE_STORE(x, y) do { \
(x)[0] = (y) & 0xff; \
(x)[1] = ((y) >> 8) & 0xff; \
(x)[2] = ((y) >> 16) & 0xff; \
(x)[3] = (y) >> 24; \
} while(0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
// Parameter sizes required by the NIST LWC AEAD API.
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // ciphertext expansion (tag) in bytes
#define CRYPTO_NOOVERLAP 1
int crypto_aead_encrypt(
unsigned char *c, unsigned long long *clen,
const unsigned char *m, unsigned long long mlen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *nsec,
const unsigned char *npub,
const unsigned char *k
);
int crypto_aead_decrypt(
unsigned char *m, unsigned long long *mlen,
unsigned char *nsec,
const unsigned char *c, unsigned long long clen,
const unsigned char *ad, unsigned long long adlen,
const unsigned char *npub,
const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the author's paper on fixsliced SKINNY implementations (IACR ePrint).
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16-byte arrays).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    const u8 *end = y + BLOCKBYTES;
    while (y != end)
        *x++ ^= *y++;
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions. Absorbs 'ad' two blocks at a time into 'auth'; when mlen == 0
* the final cipher call also encrypts the running sum into the tag buffer
* (via 'c'/'tag') instead of a dummy block, saving one cipher invocation.
* NOTE(review): this variant uses the 2-block-parallel cipher API
* (5-argument skinny128_384_plus_encrypt, 3-argument precompute_rtk1) and a
* project 'tweakey' struct — both defined elsewhere in this package.
******************************************************************************/
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
u64 mlen, const u8* ad, u64 adlen) {
u64 lfsr = 1; // block-counter LFSR, one step per single block
u8 feedback; // scratch consumed by the UPDATE_LFSR macro
u8 tmp[2*BLOCKBYTES]; // two tweak blocks processed in parallel
memset(tmp, 0x00, 2*BLOCKBYTES);
memset(auth, 0x00, BLOCKBYTES);
SET_DOMAIN(tmp, 0x02); // domain for full ad blocks
while (adlen >= 2*BLOCKBYTES) { // process 2 full blocks in parallel
LE_STR_64(tmp, lfsr); // counter for the 1st block
UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr); // counter for the 2nd block
SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
xor_block(auth, tmp); // accumulate into the auth sum
xor_block(auth, tmp + BLOCKBYTES);
adlen -= 2*BLOCKBYTES;
ad += 2*BLOCKBYTES;
UPDATE_LFSR(lfsr);
}
if (adlen > BLOCKBYTES) { // one full + one partial block: pad, run in parallel
LE_STR_64(tmp, lfsr);
UPDATE_LFSR(lfsr);
LE_STR_64(tmp + BLOCKBYTES, lfsr);
SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padded ad
precompute_rtk1(tk->rtk1, tmp, tmp + BLOCKBYTES);
adlen -= BLOCKBYTES;
memset(tmp, 0x00, BLOCKBYTES); // 10* padding of the partial block
memcpy(tmp, ad + BLOCKBYTES, adlen);
tmp[adlen] ^= 0x80; // padding bit
skinny128_384_plus_encrypt(tmp + BLOCKBYTES, tmp, ad, tmp, *tk);
xor_block(auth, tmp);
xor_block(auth, tmp + BLOCKBYTES);
} else if (adlen == BLOCKBYTES) { // exactly one full block left
LE_STR_64(tmp, lfsr);
if (mlen == 0) { // if the tag has *NOT* been computed yet
precompute_rtk1(tk->rtk1, tmp, tag); // pair ad block with tag computation
skinny128_384_plus_encrypt(auth, c, ad, c, *tk);
} else { // if the tag has already been computed
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block alone
skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk);
}
} else if (adlen > 0) { // one partial block left
LE_STR_64(tmp, lfsr);
SET_DOMAIN(tmp, 0x03); // domain for padded ad
memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // 10* padding
memcpy(tmp + BLOCKBYTES, ad, adlen);
tmp[BLOCKBYTES + adlen] ^= 0x80; // padding bit
if (mlen == 0) { // if the tag has *NOT* been computed yet
precompute_rtk1(tk->rtk1, tmp, tag); // pair ad block with tag computation
skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
} else { // if the tag has already been computed
precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block alone
skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
}
}
}
/******************************************************************************
 * Encryption and authentication using SKINNY-AEAD-M1 (NIST LWC API).
 *
 * Output layout: mlen bytes of ciphertext followed by a TAGBYTES tag,
 * so *clen = mlen + TAGBYTES. 'nsec' is unused.
 *
 * Message blocks are processed two at a time by the fixsliced
 * SKINNY-128-384+ primitive. The tag checksum is accumulated directly in
 * the tag area at c + mlen; inside the main loop c grows while mlen
 * shrinks by the same amount, so c + mlen keeps pointing at that area.
 ******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
                         const unsigned char *m, unsigned long long mlen,
                         const unsigned char *ad, unsigned long long adlen,
                         const unsigned char *nsec,
                         const unsigned char *npub,
                         const unsigned char *k) {
    u64 i,lfsr = 1;                 // 64-bit block counter, clocked by UPDATE_LFSR
    u8 feedback;                    // scratch byte required by the UPDATE_LFSR macro
    tweakey tk;
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES);     // clear the tag/checksum area
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) {          // process 2 blocks in //
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_plus_encrypt(c, c + BLOCKBYTES, m, m + BLOCKBYTES, tk);
        xor_block(c + mlen, m);             // sum for tag computation
        xor_block(c + mlen, m + BLOCKBYTES);    // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04);                  // domain for tag computation
    if (mlen > BLOCKBYTES) {                // 1 full block + 1 partial: pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for 2nd block
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // 2nd slot encrypts 'auth' (zero block) to build the pad keystream
        skinny128_384_plus_encrypt(c, auth, m, auth, tk);
        xor_block(c + mlen, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];    // CTR-style partial block
            c[mlen + i] ^= m[BLOCKBYTES + i];                   // checksum of partial block
        }
        c[mlen + i] ^= 0x80;                // padding
        SET_DOMAIN(tag, 0x05);              // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) {        // last block is full
        LE_STR_64(tmp, lfsr);               // lfsr for last full block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(c + mlen, m);             // sum for tag computation
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // tag is computed here, in parallel with the last message block
        skinny128_384_plus_encrypt(c, c + mlen, m, c + mlen, tk);
        c += BLOCKBYTES;                    // mlen stays == BLOCKBYTES: flags "tag done" below
    } else if (mlen > 0) {                  // last block is partial
        LE_STR_64(tmp, lfsr);               // lfsr for last block
        SET_DOMAIN(tmp, 0x01);              // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++)           // sum for tag computation
            c[mlen + i] ^= m[i];            // sum for tag computation
        c[mlen + i] ^= 0x80;                // padding
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // 1st slot builds pad keystream from zero block, 2nd slot computes the tag
        skinny128_384_plus_encrypt(auth, c + mlen, auth, c + mlen, tk);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i];          // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) {                        // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr);               // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) { //if all AD can be processed in //
            precompute_rtk1(tk.rtk1, tag, tag);
            skinny128_384_plus_encrypt(c, c, c, c, tk); // compute the tag (c points at checksum)
        }
        // otherwise skinny_aead_m1_auth pairs the tag with the last AD block
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, c, tag, &tk, mlen, ad, adlen);
    xor_block(c, auth);                     // fold AD authenticator into the tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
 * Decryption and authentication using SKINNY-AEAD-M1 (NIST LWC API).
 *
 * Mirrors crypto_aead_encrypt: decrypts two blocks at a time, accumulates
 * the plaintext checksum in 'sum', recomputes the tag and compares it
 * (constant-time) against the TAGBYTES trailing 'c'.
 * Returns 0 on success, -1 on authentication failure or short input.
 *
 * NOTE(review): the plaintext is written to 'm' before the tag check, as in
 * the reference implementation; callers must discard 'm' on failure.
 ******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
                         unsigned char *nsec,
                         const unsigned char *c, unsigned long long clen,
                         const unsigned char *ad, unsigned long long adlen,
                         const unsigned char *npub,
                         const unsigned char *k) {
    u64 i,lfsr = 1;                 // 64-bit block counter, clocked by UPDATE_LFSR
    u8 feedback;                    // scratch byte required by the UPDATE_LFSR macro
    tweakey tk;
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    if (clen < TAGBYTES)            // reject inputs too short to carry a tag
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES);
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) {          // process 2 blocks in //
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);  // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_plus_decrypt(m, m + BLOCKBYTES, c, c + BLOCKBYTES, tk);
        xor_block(sum, m);                  // sum for tag computation
        xor_block(sum, m + BLOCKBYTES);     // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04);                  // domain for tag computation
    if (clen > BLOCKBYTES) {                // 1 full block + 1 partial (sequential here)
        LE_STR_64(tmp, lfsr);               // lfsr for 1st block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_plus_decrypt(m, m, c, c, tk);
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr);               // lfsr for 2nd block
        SET_DOMAIN(tmp, 0x01);              // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // encrypt the zero block to rebuild the pad keystream
        skinny128_384_plus_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];    // checksum of partial block
        }
        sum[i] ^= 0x80;                     // padding
        SET_DOMAIN(tag, 0x05);              // domain for tag computation
        m += clen;
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) {        // last block is full
        LE_STR_64(tmp, lfsr);               // lfsr for last full block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_plus_decrypt(m, m, c, c, tk);
        xor_block(sum, m);                  // sum for tag computation
        SET_DOMAIN(tag, 0x04);              // domain for tag computation (already set above)
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) {                  // last block is partial
        LE_STR_64(tmp, lfsr);               // lfsr for last block
        SET_DOMAIN(tmp, 0x01);              // domain for padding
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // encrypt the zero block to rebuild the pad keystream
        skinny128_384_plus_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i];          // decrypted padded block
            sum[i] ^= m[i];                 // sum for tag computation
        }
        sum[i] ^= 0x80;                     // padding
        SET_DOMAIN(tag, 0x05);              // domain for tag computation
        UPDATE_LFSR(lfsr);
        m += clen;
        c += clen;
        clen = 0;
    }
    if (clen == 0) {                        // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr);               // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            precompute_rtk1(tk.rtk1, tag, tag); //if AD can be processed in //
            skinny128_384_plus_encrypt(sum, sum, sum, sum, tk); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, &tk, clen, ad, adlen);
    xor_block(sum, auth);                   // final tag in 'sum'; c now points at received tag
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i];          // constant-time tag verification
    // Fix: the NIST LWC API specifies -1 on failure; the original returned the
    // raw accumulator (an arbitrary nonzero value) on mismatch.
    return feedback ? -1 : 0;
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Fixsliced implementation of SKINNY-128-384.
* Two blocks are processed in parallel.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. However, the Sbox
* permutation requires 8 rounds for a synchronization. To limit the impact
* on code size, we compute the permutation every 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 0.
 ****************************************************************************/
void mixcolumns_0(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,24) & 0x0c0c0c0c;
        w ^= ROR(t,30);
        t = ROR(w,16) & 0xc0c0c0c0;
        w ^= ROR(t,4);
        t = ROR(w,8) & 0x0c0c0c0c;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 1.
 ****************************************************************************/
void mixcolumns_1(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,30);
        t = w & 0x03030303;
        w ^= ROR(t,28);
        t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 2.
 ****************************************************************************/
void mixcolumns_2(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,8) & 0xc0c0c0c0;
        w ^= ROR(t,6);
        t = ROR(w,16) & 0x0c0c0c0c;
        w ^= ROR(t,28);
        t = ROR(w,24) & 0xc0c0c0c0;
        w ^= ROR(t,2);
        state[i] = w;
    }
}
/****************************************************************************
 * Fixsliced MixColumns for rounds i with (i % 4) == 3.
 ****************************************************************************/
void mixcolumns_3(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = w & 0x03030303;
        w ^= ROR(t,30);
        t = w & 0x30303030;
        w ^= ROR(t,4);
        t = w & 0x03030303;
        w ^= ROR(t,26);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 0:
 * the three steps of mixcolumns_0 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_0(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,8) & 0x0c0c0c0c;
        w ^= ROR(t,2);
        t = ROR(w,16) & 0xc0c0c0c0;
        w ^= ROR(t,4);
        t = ROR(w,24) & 0x0c0c0c0c;
        w ^= ROR(t,30);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 1:
 * the three steps of mixcolumns_1 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_1(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,2);
        t = w & 0x03030303;
        w ^= ROR(t,28);
        t = ROR(w,16) & 0x30303030;
        w ^= ROR(t,30);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 2:
 * the three steps of mixcolumns_2 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_2(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = ROR(w,24) & 0xc0c0c0c0;
        w ^= ROR(t,2);
        t = ROR(w,16) & 0x0c0c0c0c;
        w ^= ROR(t,28);
        t = ROR(w,8) & 0xc0c0c0c0;
        w ^= ROR(t,6);
        state[i] = w;
    }
}
/****************************************************************************
 * Inverse fixsliced MixColumns for rounds i with (i % 4) == 3:
 * the three steps of mixcolumns_3 undone in reverse order.
 ****************************************************************************/
void inv_mixcolumns_3(u32* state) {
    for(int i = 0; i < 8; i++) {
        u32 w = state[i];
        u32 t = w & 0x03030303;
        w ^= ROR(t,26);
        t = w & 0x30303030;
        w ^= ROR(t,4);
        t = w & 0x03030303;
        w ^= ROR(t,30);
        state[i] = w;
    }
}
/****************************************************************************
 * XORs the round tweakey words (TK1 and combined TK2/TK3, round constants
 * already folded in) into the 8-word bitsliced state.
 ****************************************************************************/
void add_tweakey(u32* state, const u32* rtk1, const u32* rtk2_3) {
    for (int i = 0; i < 8; i++)
        state[i] ^= rtk1[i] ^ rtk2_3[i];
}
/****************************************************************************
 * Encryption of 2 blocks in parallel using SKINNY-128-384.
 * 'rtk1' and 'rtk2_3' arrive precomputed inside 'tk' to avoid recomputing
 * the whole tweakey schedule for every call during SKINNY-AEAD-M1.
 ****************************************************************************/
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
                const u8* ptext_bis, const tweakey tk) {
    u32 state[8];
    packing(state, ptext, ptext_bis);
    // 40 rounds = 10 quadruple rounds. rtk1 repeats with period 16 rounds
    // (4 quadruple rounds) while rtk2_3 supplies fresh words every round.
    for (int r = 0; r < 10; r++)
        QUADRUPLE_ROUND(state, tk.rtk1 + 32 * (r % 4), tk.rtk2_3 + 32 * r);
    unpacking(ctext, ctext_bis, state);
}
/****************************************************************************
 * Decryption of 2 blocks in parallel using SKINNY-128-384.
 * 'rtk1' and 'rtk2_3' arrive precomputed inside 'tk' to avoid recomputing
 * the whole tweakey schedule for every call during SKINNY-AEAD-M1.
 ****************************************************************************/
void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
                const u8* ctext_bis, const tweakey tk) {
    u32 state[8];
    packing(state, ctext, ctext_bis);
    // Same 10 quadruple rounds as encryption, undone in reverse order.
    for (int r = 9; r >= 0; r--)
        INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32 * (r % 4), tk.rtk2_3 + 32 * r);
    unpacking(ptext, ptext_bis, state);
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_

#include "tk_schedule.h"

/* Encrypt/decrypt 2 blocks in parallel with fixsliced SKINNY-128-384+.
 * 'tk' carries the precomputed round tweakeys (see tk_schedule.h). */
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
				const u8* ptext_bis, const tweakey tk);
/* Fix: parameter names now match the definition -- the prototype previously
 * labelled the plaintext outputs 'ctext' and the ciphertext inputs 'ptext'. */
void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
				const u8* ctext_bis, const tweakey tk);

#define SKINNY128_384_ROUNDS	40

/* 32-bit right rotation; requires 0 < y < 32 (y == 0 or 32 would shift by
 * the type width, which is undefined behavior). All call sites comply. */
#define ROR(x,y) 			(((x) >> (y)) | ((x) << (32 - (y))))
/*
 * Four consecutive SKINNY rounds on the bitsliced 2-block state:
 * bitsliced Sbox layer, tweakey addition, then the fixsliced MixColumns
 * variant for that round index. ShiftRows is absorbed into the
 * MixColumns masks. The trailing XOR-swap triples realign the slice
 * ordering every 4 rounds. GNU statement-expression ({ ... }) extension.
 */
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] | state[5]); \
	state[1] ^= (state[6] | state[5]); \
	state[2] ^= (state[3] & state[7]); \
	state[6] ^= (~state[7] | state[4]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[4] ^= (~state[3] | state[2]); \
	state[5] ^= (state[6] & state[0]); \
	add_tweakey(state, rtk1, rtk2_3); \
	mixcolumns_0(state); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] | state[1]); \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] & state[5]); \
	state[0] ^= (~state[5] | state[6]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[6] ^= (~state[4] | state[7]); \
	state[1] ^= (state[0] & state[2]); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	mixcolumns_1(state); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] | state[3]); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] & state[1]); \
	state[2] ^= (~state[1] | state[0]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[0] ^= (~state[6] | state[5]); \
	state[3] ^= (state[2] & state[7]); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	mixcolumns_2(state); \
	state[0] ^= (state[5] | state[6]); \
	state[3] ^= (state[2] | state[4]); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] & state[3]); \
	state[7] ^= (~state[3] | state[2]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[2] ^= (~state[0] | state[1]); \
	state[4] ^= (state[7] & state[5]); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	mixcolumns_3(state); \
	state[0] ^= state[1]; /* XOR-swap state[0] <-> state[1] */ \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; /* XOR-swap state[2] <-> state[3] */ \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; /* XOR-swap state[4] <-> state[7] */ \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; /* XOR-swap state[5] <-> state[6] */ \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
})
/*
 * Exact inverse of QUADRUPLE_ROUND: slice swaps first, then each of the
 * four rounds undone in reverse order (inverse MixColumns, tweakey
 * addition, inverse bitsliced Sbox). GNU statement-expression extension.
 */
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[0] ^= state[1]; /* XOR-swap state[0] <-> state[1] */ \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; /* XOR-swap state[2] <-> state[3] */ \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; /* XOR-swap state[4] <-> state[7] */ \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; /* XOR-swap state[5] <-> state[6] */ \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
	inv_mixcolumns_3(state); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	state[4] ^= (state[7] & state[5]); \
	state[2] ^= (~state[0] | state[1]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[7] ^= (~state[3] | state[2]); \
	state[1] ^= (state[0] & state[3]); \
	state[6] ^= (state[7] | state[4]); \
	state[3] ^= (state[2] | state[4]); \
	state[0] ^= (state[5] | state[6]); \
	inv_mixcolumns_2(state); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	state[3] ^= (state[2] & state[7]); \
	state[0] ^= (~state[6] | state[5]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[2] ^= (~state[1] | state[0]); \
	state[5] ^= (state[6] & state[1]); \
	state[4] ^= (state[2] | state[3]); \
	state[1] ^= (state[0] | state[3]); \
	state[6] ^= (state[7] | state[4]); \
	inv_mixcolumns_1(state); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	state[1] ^= (state[0] & state[2]); \
	state[6] ^= (~state[4] | state[7]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[0] ^= (~state[5] | state[6]); \
	state[7] ^= (state[4] & state[5]); \
	state[3] ^= (state[0] | state[1]); \
	state[5] ^= (state[6] | state[1]); \
	state[4] ^= (state[2] | state[3]); \
	inv_mixcolumns_0(state); \
	add_tweakey(state, rtk1, rtk2_3); \
	state[5] ^= (state[6] & state[0]); \
	state[4] ^= (~state[3] | state[2]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[6] ^= (~state[7] | state[4]); \
	state[2] ^= (state[3] & state[7]); \
	state[1] ^= (state[6] | state[5]); \
	state[7] ^= (state[4] | state[5]); \
	state[3] ^= (state[0] | state[1]); \
})
#endif  // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
// SKINNY-AEAD-M1 parameters: 128-bit tag, key and cipher block.
#define TAGBYTES    16
#define KEYBYTES    16
#define BLOCKBYTES  16
// Writes the domain-separation byte into the last byte of a 16-byte tweak block.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Clocks the 64-bit block-counter LFSR once: shift left, XOR 0x1B into the
// low byte when the top bit was set (Galois-style feedback).
// NOTE(review): relies on a local u8 variable literally named 'feedback'
// being in scope at every call site (declared in crypto_aead_encrypt/decrypt).
// Uses the GNU statement-expression ({ ... }) extension.
#define UPDATE_LFSR(lfsr) ({ \
	feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
	(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Stores the 64-bit value 'x' little-endian into ptr[0..7].
#define LE_STR_64(ptr, x) ({ \
	(ptr)[0] = (u8)(x); \
	(ptr)[1] = (u8)((x) >> 8); \
	(ptr)[2] = (u8)((x) >> 16); \
	(ptr)[3] = (u8)((x) >> 24); \
	(ptr)[4] = (u8)((x) >> 32); \
	(ptr)[5] = (u8)((x) >> 40); \
	(ptr)[6] = (u8)((x) >> 48); \
	(ptr)[7] = (u8)((x) >> 56); \
})
#endif  // SKINNYAEADM1_H_
\ No newline at end of file
/*******************************************************************************
* Implementation of the tweakey schedule according to the fixsliced
* representation.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************
* The round constants according to the fixsliced representation.
****************************************************************************/
u32 rconst_32_bs[320] = {
0xfffffff3, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000300, 0xcffffcff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xffcffffc, 0xffcfffff, 0xffcfffff,
0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
0xfcffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000c00,
0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c30, 0xfffcf3cf, 0xffffffff, 0xffffffcf,
0xffffff03, 0xffffff3f, 0x00000000, 0xffffffff,
0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
0xfffffcff, 0x00000300, 0xcffffc3f, 0xfffffcff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xf3ffffff, 0x00000300,
0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x0c000000, 0xf3fffffc, 0xffcfffff, 0xffcfffff,
0xffc3ffff, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
0xffffffff, 0x03c00000, 0xfc3fcfff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000c00,
0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff33ff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000000, 0xfffcffcf, 0xffffffcf, 0xffffffcf,
0xfffffff3, 0xffffff3f, 0x00000000, 0xffffffff,
0xffffffff, 0x000000f0, 0xffffff3f, 0xffffffff,
0xfffffcff, 0x000000c0, 0xcffffc3f, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x0c000300,
0xf3ffffff, 0x00000000, 0xffffffff, 0x3ffffcff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xf3cffffc, 0xffffffff, 0xffcfffff,
0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00300000, 0xffffffff, 0xffffffff,
0xfcffffff, 0x00000000, 0xff3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
0xffffffff, 0x00000000, 0xffffffff, 0xffff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffff,
0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000000, 0xcffffcff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
0xf3ffffff, 0x00000000, 0xffffffff, 0x3fffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xffcffffc, 0xffffffff, 0xffcfffff,
0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xffffffff, 0xffffffff,
0xfcffffff, 0x00000000, 0xfc3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
0xfffff3ff, 0x00000000, 0xffffffff, 0xffff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffcf,
0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000300, 0xcffffc3f, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00300000, 0xfffffffc, 0xffcfffff, 0xffcfffff,
0xff33ffff, 0xffffffff, 0x00000000, 0xffffffff,
0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
0xffffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0xffffffff, 0xffffffff, 0xfffff3ff, 0x00000c00,
0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
0x00000c00, 0xfffcffcf, 0xffffffff, 0xffffffcf
};
/****************************************************************************
* Packs 2 input blocks B, B' into the state using a bitsliced representation.
* Once the packing process is complete, the 256-bit state consists of 8
 * 32-bit words and the input blocks bit positioning is as follows:
*
* 24 24' 56 56' 88 88' 120 120' | ... | 0 0' 32 32' 64 64' 96 96'
* 25 25' 57 57' 89 89' 121 121' | ... | 1 1' 33 33' 65 65' 97 97'
* 26 26' 58 58' 90 90' 122 122' | ... | 2 2' 34 34' 66 66' 98 98'
* 27 27' 59 59' 91 91' 123 123' | ... | 3 3' 35 35' 67 67' 99 99'
* 28 28' 60 60' 92 92' 124 124' | ... | 4 4' 36 36' 68 68' 100 100'
* 29 29' 61 61' 93 93' 125 125' | ... | 5 5' 37 37' 69 69' 101 101'
* 30 30' 62 62' 94 94' 126 126' | ... | 6 6' 38 38' 70 70' 102 102'
* 31 31' 63 63' 95 95' 127 127' | ... | 7 7' 39 39' 71 71' 103 103'
****************************************************************************/
void packing(u32* out, const u8* block0, const u8* block1) {
	u32 tmp;    // scratch word; presumably consumed by the SWAPMOVE macro -- TODO confirm
	// Load both 16-byte blocks as little-endian 32-bit words, interleaved
	// (even indices from block0, odd indices from block1).
	LE_LOAD(out, block0);
	LE_LOAD(out + 1, block1);
	LE_LOAD(out + 2, block0 + 4);
	LE_LOAD(out + 3, block1 + 4);
	LE_LOAD(out + 4, block0 + 8);
	LE_LOAD(out + 5, block1 + 8);
	LE_LOAD(out + 6, block0 + 12);
	LE_LOAD(out + 7, block1 + 12);
	// Bit-matrix transposition into the slice layout (see the diagram in
	// the comment above) via successive swapmove steps.
	SWAPMOVE(out[1], out[0], 0x55555555, 1);
	SWAPMOVE(out[3], out[2], 0x55555555, 1);
	SWAPMOVE(out[5], out[4], 0x55555555, 1);
	SWAPMOVE(out[7], out[6], 0x55555555, 1);
	SWAPMOVE(out[2], out[0], 0x30303030, 2);
	SWAPMOVE(out[4], out[0], 0x0c0c0c0c, 4);
	SWAPMOVE(out[6], out[0], 0x03030303, 6);
	SWAPMOVE(out[3], out[1], 0x30303030, 2);
	SWAPMOVE(out[5], out[1], 0x0c0c0c0c, 4);
	SWAPMOVE(out[7], out[1], 0x03030303, 6);
	SWAPMOVE(out[4], out[2], 0x0c0c0c0c, 2);
	SWAPMOVE(out[6], out[2], 0x03030303, 4);
	SWAPMOVE(out[5], out[3], 0x0c0c0c0c, 2);
	SWAPMOVE(out[7], out[3], 0x03030303, 4);
	SWAPMOVE(out[6], out[4], 0x03030303, 2);
	SWAPMOVE(out[7], out[5], 0x03030303, 2);
}
/****************************************************************************
 * Unpacks the 256-bit bitsliced state back into two 16-byte output blocks
 * ('out' and 'out_bis'); exact inverse of packing(): the same swapmove
 * steps applied in reverse order, then little-endian stores.
 ****************************************************************************/
void unpacking(u8* out, u8* out_bis, u32 *in) {
	u32 tmp;    // scratch word; presumably consumed by the SWAPMOVE macro -- TODO confirm
	SWAPMOVE(in[6], in[4], 0x03030303, 2);
	SWAPMOVE(in[7], in[5], 0x03030303, 2);
	SWAPMOVE(in[5], in[3], 0x0c0c0c0c, 2);
	SWAPMOVE(in[7], in[3], 0x03030303, 4);
	SWAPMOVE(in[4], in[2], 0x0c0c0c0c, 2);
	SWAPMOVE(in[6], in[2], 0x03030303, 4);
	SWAPMOVE(in[7], in[1], 0x03030303, 6);
	SWAPMOVE(in[5], in[1], 0x0c0c0c0c, 4);
	SWAPMOVE(in[3], in[1], 0x30303030, 2);
	SWAPMOVE(in[6], in[0], 0x03030303, 6);
	SWAPMOVE(in[4], in[0], 0x0c0c0c0c, 4);
	SWAPMOVE(in[2], in[0], 0x30303030, 2);
	SWAPMOVE(in[1], in[0], 0x55555555, 1);
	SWAPMOVE(in[3], in[2], 0x55555555, 1);
	SWAPMOVE(in[5], in[4], 0x55555555, 1);
	SWAPMOVE(in[7], in[6], 0x55555555, 1);
	// Store little-endian, de-interleaving the two blocks.
	LE_STORE(out, in[0]);
	LE_STORE(out_bis, in[1]);
	LE_STORE(out + 4, in[2]);
	LE_STORE(out_bis + 4, in[3]);
	LE_STORE(out + 8, in[4]);
	LE_STORE(out_bis + 8, in[5]);
	LE_STORE(out + 12, in[6]);
	LE_STORE(out_bis + 12, in[7]);
}
// Applies the tweakey permutation twice (P^2) in bitsliced form.
void permute_tk_2(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,14) & 0xcc00cc00;
        r |= (w & 0x000000ff) << 16;
        r |= (w & 0xcc000000) >> 2;
        r |= (w & 0x0033cc00) >> 8;
        r |= (w & 0x00cc0000) >> 18;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 4 times (P^4) in bitsliced form.
void permute_tk_4(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,22) & 0xcc0000cc;
        r |= ROR(w,16) & 0x3300cc00;
        r |= ROR(w,24) & 0x00cc3300;
        r |= (w & 0x00cc00cc) >> 2;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 6 times (P^6) in bitsliced form.
void permute_tk_6(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,6) & 0xcccc0000;
        r |= ROR(w,24) & 0x330000cc;
        r |= ROR(w,10) & 0x3333;
        r |= (w & 0xcc) << 14;
        r |= (w & 0x3300) << 2;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 8 times (P^8) in bitsliced form.
void permute_tk_8(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,24) & 0xcc000033;
        r |= ROR(w,8) & 0x33cc0000;
        r |= ROR(w,26) & 0x00333300;
        r |= (w & 0x00333300) >> 6;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 10 times (P^10) in bitsliced form.
void permute_tk_10(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,8) & 0xcc330000;
        r |= ROR(w,26) & 0x33000033;
        r |= ROR(w,22) & 0x00cccc00;
        r |= (w & 0x00330000) >> 14;
        r |= (w & 0xcc00) >> 2;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 12 times (P^12) in bitsliced form.
void permute_tk_12(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,8) & 0xcc33;
        r |= ROR(w,30) & 0x00cc00cc;
        r |= ROR(w,10) & 0x33330000;
        r |= ROR(w,16) & 0xcc003300;
        tk[i] = r;
    }
}
// Applies the tweakey permutation 14 times (P^14) in bitsliced form.
void permute_tk_14(u32* tk) {
    for (int i = 0; i < 8; i++) {
        u32 w = tk[i];
        u32 r = ROR(w,24) & 0x0033cc00;
        r |= ROR(w,14) & 0x00cc0000;
        r |= ROR(w,30) & 0xcc000000;
        r |= ROR(w,16) & 0x000000ff;
        r |= ROR(w,18) & 0x33003300;
        tk[i] = r;
    }
}
/****************************************************************************
 * Packs the two TK2 inputs and precomputes their LFSR2 stream for 'rounds'
 * rounds: the packed state is clocked once per pair of rounds and each
 * 32-byte snapshot is stored consecutively into 'tk'.
 ****************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* tk2_0,
		const u8* tk2_1, const int rounds) {
	u32 tmp;    // scratch; presumably required by the LFSR2 macro -- TODO confirm
	u32 state[8];
	packing(state, tk2_0, tk2_1);
	memcpy(tk, state, 32);              // rounds 0-1 use the unclocked state
	for(int i = 0 ; i < rounds; i+=2) {
		LFSR2(state);
		memcpy(tk+i*8+8, state, 32);    // snapshot for rounds i+2, i+3
	}
}
/****************************************************************************
 * Packs the two TK3 inputs and XORs their LFSR3 stream on top of the words
 * already in 'tk' (presumably the TK2 stream written by
 * precompute_lfsr_tk2 -- it accumulates with ^=, it does not overwrite).
 * One clock per pair of rounds, 32 bytes per snapshot, as for TK2.
 ****************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* tk3_0,
		const u8* tk3_1, const int rounds) {
	u32 tmp;    // scratch; presumably required by the LFSR3 macro -- TODO confirm
	u32 state[8];
	packing(state, tk3_0, tk3_1);
	for(int i = 0; i < 8; i++)
		tk[i] ^= state[i];              // rounds 0-1 use the unclocked state
	for(int i = 0 ; i < rounds; i+=2) {
		LFSR3(state);
		tk[i*8+8] ^= state[0];
		tk[i*8+9] ^= state[1];
		tk[i*8+10] ^= state[2];
		tk[i*8+11] ^= state[3];
		tk[i*8+12] ^= state[4];
		tk[i*8+13] ^= state[5];
		tk[i*8+14] ^= state[6];
		tk[i*8+15] ^= state[7];
	}
}
/****************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the fixsliced representation.
 *
 * tk     : in/out, 8 bitsliced words per round (LFSR2(TK2)^LFSR3(TK3) on
 *          entry, full permuted round tweakey on exit)
 * tk1_0/1: the two raw 16-byte TK1 blocks (packed here)
 * rounds : number of rounds; the loop handles 8 rounds per iteration
 ****************************************************************************/
void permute_tk(u32* tk, const u8* tk1_0, const u8* tk1_1, const int rounds) {
	u32 test; //selects which powers of P to apply (P is periodic with 16)
	u32 tk1[8], tmp[8];
	packing(tk1, tk1_0, tk1_1); //bitslice the two TK1 blocks
	memcpy(tmp, tk, 32);
	XOR_BLOCK(tmp, tk1);
	tk[0] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
	tk[1] = tmp[5] & 0xf0f0f0f0;
	tk[2] = tmp[0] & 0xf0f0f0f0;
	tk[3] = tmp[1] & 0xf0f0f0f0;
	tk[4] = tmp[3] & 0xf0f0f0f0;
	tk[5] = tmp[7] & 0xf0f0f0f0;
	tk[6] = tmp[4] & 0xf0f0f0f0;
	tk[7] = tmp[2] & 0xf0f0f0f0;
	for(int i = 0 ; i < rounds; i+=8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		memcpy(tmp, tk+i*8+8, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		tk[i*8+8] = ROR(tmp[4],26) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+9] = ROR(tmp[7],26) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+10] = ROR(tmp[6],26) & 0xc3c3c3c3;
		tk[i*8+11] = ROR(tmp[5],26) & 0xc3c3c3c3;
		tk[i*8+12] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*8+13] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*8+14] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*8+15] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*8+16] = ROR(tmp[3],28) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+16] |= ROR(tmp[3],12) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+17] = ROR(tmp[2],28) & 0x03030303;
		tk[i*8+17] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*8+18] = ROR(tmp[4],28) & 0x03030303;
		tk[i*8+18] |= ROR(tmp[4],12) & 0x0c0c0c0c;
		tk[i*8+19] = ROR(tmp[7],28) & 0x03030303;
		tk[i*8+19] |= ROR(tmp[7],12) & 0x0c0c0c0c;
		tk[i*8+20] = ROR(tmp[5],28) & 0x03030303;
		tk[i*8+20] |= ROR(tmp[5],12) & 0x0c0c0c0c;
		tk[i*8+21] = ROR(tmp[0],28) & 0x03030303;
		tk[i*8+21] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*8+22] = ROR(tmp[1],28) & 0x03030303;
		tk[i*8+22] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		tk[i*8+23] = ROR(tmp[6],28) & 0x03030303;
		tk[i*8+23] |= ROR(tmp[6],12) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*8+24, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		tk[i*8+24] = ROR(tmp[1],14) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+24] |= ROR(tmp[1],6) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+25] = ROR(tmp[0],14) & 0x30303030;
		tk[i*8+25] |= ROR(tmp[0],6) & 0x0c0c0c0c;
		tk[i*8+26] = ROR(tmp[3],14) & 0x30303030;
		tk[i*8+26] |= ROR(tmp[3],6) & 0x0c0c0c0c;
		tk[i*8+27] = ROR(tmp[2],14) & 0x30303030;
		tk[i*8+27] |= ROR(tmp[2],6) & 0x0c0c0c0c;
		tk[i*8+28] = ROR(tmp[7],14) & 0x30303030;
		tk[i*8+28] |= ROR(tmp[7],6) & 0x0c0c0c0c;
		tk[i*8+29] = ROR(tmp[6],14) & 0x30303030;
		tk[i*8+29] |= ROR(tmp[6],6) & 0x0c0c0c0c;
		tk[i*8+30] = ROR(tmp[5],14) & 0x30303030;
		tk[i*8+30] |= ROR(tmp[5],6) & 0x0c0c0c0c;
		tk[i*8+31] = ROR(tmp[4],14) & 0x30303030;
		tk[i*8+31] |= ROR(tmp[4],6) & 0x0c0c0c0c;
		tk[i*8+32] = ROR(tmp[6],16) & 0xf0f0f0f0; //mask to extract rows 1&2 only
		tk[i*8+33] = ROR(tmp[5],16) & 0xf0f0f0f0; //rotation to match fixslicing
		tk[i*8+34] = ROR(tmp[0],16) & 0xf0f0f0f0;
		tk[i*8+35] = ROR(tmp[1],16) & 0xf0f0f0f0;
		tk[i*8+36] = ROR(tmp[3],16) & 0xf0f0f0f0;
		tk[i*8+37] = ROR(tmp[7],16) & 0xf0f0f0f0;
		tk[i*8+38] = ROR(tmp[4],16) & 0xf0f0f0f0;
		tk[i*8+39] = ROR(tmp[2],16) & 0xf0f0f0f0;
		memcpy(tmp, tk+i*8+40, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		tk[i*8+40] = ROR(tmp[4],10) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+41] = ROR(tmp[7],10) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+42] = ROR(tmp[6],10) & 0xc3c3c3c3;
		tk[i*8+43] = ROR(tmp[5],10) & 0xc3c3c3c3;
		tk[i*8+44] = ROR(tmp[1],10) & 0xc3c3c3c3;
		tk[i*8+45] = ROR(tmp[2],10) & 0xc3c3c3c3;
		tk[i*8+46] = ROR(tmp[3],10) & 0xc3c3c3c3;
		tk[i*8+47] = ROR(tmp[0],10) & 0xc3c3c3c3;
		tk[i*8+48] = ROR(tmp[3],12) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+48] |= ROR(tmp[3],28) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+49] = ROR(tmp[2],12) & 0x03030303;
		tk[i*8+49] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*8+50] = ROR(tmp[4],12) & 0x03030303;
		tk[i*8+50] |= ROR(tmp[4],28) & 0x0c0c0c0c;
		tk[i*8+51] = ROR(tmp[7],12) & 0x03030303;
		tk[i*8+51] |= ROR(tmp[7],28) & 0x0c0c0c0c;
		tk[i*8+52] = ROR(tmp[5],12) & 0x03030303;
		tk[i*8+52] |= ROR(tmp[5],28) & 0x0c0c0c0c;
		tk[i*8+53] = ROR(tmp[0],12) & 0x03030303;
		tk[i*8+53] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*8+54] = ROR(tmp[1],12) & 0x03030303;
		tk[i*8+54] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		tk[i*8+55] = ROR(tmp[6],12) & 0x03030303;
		tk[i*8+55] |= ROR(tmp[6],28) & 0x0c0c0c0c;
		memcpy(tmp, tk+i*8+56, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8 (P^16 = identity, so no call otherwise)
		tk[i*8+56] = ROR(tmp[1],30) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+56] |= ROR(tmp[1],22) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+57] = ROR(tmp[0],30) & 0x30303030;
		tk[i*8+57] |= ROR(tmp[0],22) & 0x0c0c0c0c;
		tk[i*8+58] = ROR(tmp[3],30) & 0x30303030;
		tk[i*8+58] |= ROR(tmp[3],22) & 0x0c0c0c0c;
		tk[i*8+59] = ROR(tmp[2],30) & 0x30303030;
		tk[i*8+59] |= ROR(tmp[2],22) & 0x0c0c0c0c;
		tk[i*8+60] = ROR(tmp[7],30) & 0x30303030;
		tk[i*8+60] |= ROR(tmp[7],22) & 0x0c0c0c0c;
		tk[i*8+61] = ROR(tmp[6],30) & 0x30303030;
		tk[i*8+61] |= ROR(tmp[6],22) & 0x0c0c0c0c;
		tk[i*8+62] = ROR(tmp[5],30) & 0x30303030;
		tk[i*8+62] |= ROR(tmp[5],22) & 0x0c0c0c0c;
		tk[i*8+63] = ROR(tmp[4],30) & 0x30303030;
		tk[i*8+63] |= ROR(tmp[4],22) & 0x0c0c0c0c;
		if (i+8 < rounds) { //only if next loop iteration
			tk[i*8+64] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
			tk[i*8+65] = tmp[5] & 0xf0f0f0f0;
			tk[i*8+66] = tmp[0] & 0xf0f0f0f0;
			tk[i*8+67] = tmp[1] & 0xf0f0f0f0;
			tk[i*8+68] = tmp[3] & 0xf0f0f0f0;
			tk[i*8+69] = tmp[7] & 0xf0f0f0f0;
			tk[i*8+70] = tmp[4] & 0xf0f0f0f0;
			tk[i*8+71] = tmp[2] & 0xf0f0f0f0;
		}
	}
}
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for all rounds.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3, int rounds) {
	memset(rtk, 0x00, 32*rounds);
	precompute_lfsr_tk2(rtk, tk2, tk2, rounds);	//rtk <- LFSR2(TK2)
	precompute_lfsr_tk3(rtk, tk3, tk3, rounds);	//rtk ^= LFSR3(TK3)
	//rtk+16 points into the zeroed buffer, so TK1 = 0 for this pass
	permute_tk(rtk, (u8*)(rtk+16), (u8*)(rtk+16), rounds);
	for(int k = 0; k < 8*rounds; k++)	//finally add the round constants
		rtk[k] ^= rconst_32_bs[k];
}
//Precompute the TK1 contribution to the round tweakeys.
//Only 16 rounds are computed since RTK1 is periodic with period 16.
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis) {
	const int rounds = 16;
	memset(rtk1, 0x00, 32*rounds);
	permute_tk(rtk1, tk1, tk1_bis, rounds);
}
#ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Precomputed round tweakeys in bitsliced form: 8 u32 words per round.
typedef struct {
	u32 rtk1[8*16]; //TK1 contribution; computed for 16 rounds (periodic)
	u32 rtk2_3[8*40]; //LFSR2(TK2) ^ LFSR3(TK3) ^ rconst, 40 rounds
} tweakey;
//Bitslice two 16-byte blocks into 8 u32 words / invert that packing.
void packing(u32* out, const u8* block0, const u8* block1);
void unpacking(u8* out, u8* out_bis, u32 *in);
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for 'rounds' rounds.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3, int rounds);
//Precompute the (16-round periodic) TK1 contribution.
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
//Bitsliced TK2 LFSR: shift all 8 slices, new slice 7 = slice 0 ^ slice 2.
//Relies on a local 'u32 tmp' being in scope at the call site.
#define LFSR2(tk) ({ \
	tmp = (tk)[0] ^ (tk)[2]; \
	(tk)[0] = (tk)[1]; \
	(tk)[1] = (tk)[2]; \
	(tk)[2] = (tk)[3]; \
	(tk)[3] = (tk)[4]; \
	(tk)[4] = (tk)[5]; \
	(tk)[5] = (tk)[6]; \
	(tk)[6] = (tk)[7]; \
	(tk)[7] = tmp; \
})
//Bitsliced TK3 LFSR: shift all 8 slices the other way,
//new slice 0 = slice 7 ^ slice 1. Also relies on a local 'u32 tmp'.
#define LFSR3(tk) ({ \
	tmp = (tk)[7] ^ (tk)[1]; \
	(tk)[7] = (tk)[6]; \
	(tk)[6] = (tk)[5]; \
	(tk)[5] = (tk)[4]; \
	(tk)[4] = (tk)[3]; \
	(tk)[3] = (tk)[2]; \
	(tk)[2] = (tk)[1]; \
	(tk)[1] = (tk)[0]; \
	(tk)[0] = tmp; \
})
//x ^= y on 8-word (256-bit) bitsliced blocks.
#define XOR_BLOCK(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
	(x)[4] ^= (y)[4]; \
	(x)[5] ^= (y)[5]; \
	(x)[6] ^= (y)[6]; \
	(x)[7] ^= (y)[7]; \
})
//Swap the bits of a and b selected by 'mask' at distance n.
//Relies on a local 'u32 tmp' being in scope at the call site.
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = (b ^ (a >> n)) & mask; \
	b ^= tmp; \
	a ^= (tmp << n); \
})
//Little-endian load of 4 bytes from y into the u32 pointed to by x.
#define LE_LOAD(x, y) \
	*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0]);
//Little-endian store of the u32 y into the 4 bytes pointed to by x.
#define LE_STORE(x, y) \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24;
//32-bit rotate right; assumes 0 < y < 32 (y == 0 would shift by 32 -> UB).
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#endif // TK_SCHEDULE_BS_H_
\ No newline at end of file
//Parameter sizes (in bytes) exposed through the NIST LWC / SUPERCOP API.
#define CRYPTO_KEYBYTES 16 //128-bit key
#define CRYPTO_NSECBYTES 0 //no secret message number
#define CRYPTO_NPUBBYTES 16 //128-bit public nonce
#define CRYPTO_ABYTES 16 //128-bit authentication tag
#define CRYPTO_NOOVERLAP 1 //buffers may not partially overlap (SUPERCOP convention)
//Authenticated encryption: c = ciphertext || tag, *clen = mlen + CRYPTO_ABYTES.
int crypto_aead_encrypt(
	unsigned char *c, unsigned long long *clen,
	const unsigned char *m, unsigned long long mlen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *nsec,
	const unsigned char *npub,
	const unsigned char *k
);
//Verified decryption: returns nonzero on authentication failure.
int crypto_aead_decrypt(
	unsigned char *m, unsigned long long *mlen,
	unsigned char *nsec,
	const unsigned char *c, unsigned long long clen,
	const unsigned char *ad, unsigned long long adlen,
	const unsigned char *npub,
	const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
 * x ^= y where x, y are 128-bit blocks (16-byte arrays).
 ******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
	const u8 *src = y;
	u8 *dst = x;
	while (dst < x + BLOCKBYTES)
		*dst++ ^= *src++;
}
/******************************************************************************
 * Encryption and authentication using SKINNY-AEAD-M1.
 *
 * c/clen : output ciphertext followed by the 16-byte tag; *clen = mlen + 16
 * m/mlen : input plaintext
 * ad/adlen: associated data (authenticated, not encrypted)
 * nsec   : unused (CRYPTO_NSECBYTES == 0)
 * npub   : 16-byte nonce, k: 16-byte key
 * Returns 0.
 *
 * Fix: removed the unused local 'u8 feedback' (only needed in decryption).
 ******************************************************************************/
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
					const unsigned char *m, unsigned long long mlen,
					const unsigned char *ad, unsigned long long adlen,
					const unsigned char *nsec,
					const unsigned char *npub,
					const unsigned char *k) {
	u64 i, lfsr = 1;				// 64-bit LFSR acts as the block counter
	u32 rtk1[4*16];					// round tweakeys from TK1 (16-round period)
	u32 rtk2_3[4*SKINNY128_384_ROUNDS];	// round tweakeys from TK2/TK3
	u8 tmp[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
	(void)nsec;
	// ----------------- Initialization -----------------
	*clen = mlen + TAGBYTES;
	tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(rtk2_3);
	memset(tmp, 0x00, BLOCKBYTES);
	memset(auth, 0x00, BLOCKBYTES);
	memset(sum, 0x00, BLOCKBYTES);
	// ----------------- Initialization -----------------
	// ----------------- Process the plaintext -----------------
	while (mlen >= BLOCKBYTES) { // while entire blocks to process
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
		skinny128_384(c, rtk2_3, m, rtk1);
		xor_block(sum, m); // sum for tag computation
		mlen -= BLOCKBYTES;
		c += BLOCKBYTES;
		m += BLOCKBYTES;
		UPDATE_LFSR(lfsr); // update lfsr for next block
	}
	SET_DOMAIN(tmp, 0x04); // domain for tag computation
	if (mlen > 0) { // last block is partial
		LE_STR_64(tmp, lfsr); // lfsr for last block
		SET_DOMAIN(tmp, 0x01); // domain for padding
		for(i = 0; i < mlen; i++)
			sum[i] ^= m[i]; // sum for tag computation
		sum[i] ^= 0x80; // padding
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
		for(i = 0; i < mlen; i++)
			c[i] = auth[i] ^ m[i]; // encrypted padded block
		c += mlen;
		SET_DOMAIN(tmp, 0x05); // domain for tag computation
		UPDATE_LFSR(lfsr);
	}
	LE_STR_64(tmp, lfsr); // lfsr for tag computation
	tkschedule_perm_tk1(rtk1, tmp);
	skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
	memcpy(c, sum, TAGBYTES); // c now points at the tag location
	// ----------------- Process the plaintext -----------------
	// ----------------- Process the associated data -----------------
	lfsr = 1; // restart the counter for the AD
	SET_DOMAIN(tmp, 0x02);
	memset(auth, 0x00, BLOCKBYTES);
	while (adlen >= BLOCKBYTES) {
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(sum, rtk2_3, ad, rtk1); // use 'sum' as tmp array
		xor_block(auth, sum);
		adlen -= BLOCKBYTES;
		ad += BLOCKBYTES;
		UPDATE_LFSR(lfsr);
	}
	if (adlen > 0) { // partial AD block: pad with 10* and switch domain
		LE_STR_64(tmp, lfsr);
		SET_DOMAIN(tmp, 0x03); // domain for padding ad
		tkschedule_perm_tk1(rtk1, tmp);
		memset(tmp, 0x00, BLOCKBYTES); // padding
		memcpy(tmp, ad, adlen); // padding
		tmp[adlen] = 0x80; // padding
		skinny128_384(tmp, rtk2_3, tmp, rtk1);
		xor_block(auth, tmp);
	}
	xor_block(c, auth); // fold the AD authenticator into the tag
	// ----------------- Process the associated data -----------------
	return 0;
}
/******************************************************************************
 * Decryption and verification using SKINNY-AEAD-M1.
 *
 * m/mlen : output plaintext; *mlen = clen - 16
 * c/clen : input ciphertext followed by the 16-byte tag
 * ad/adlen: associated data; npub: 16-byte nonce; k: 16-byte key
 * Returns 0 on success, -1 on authentication failure (NIST LWC API).
 *
 * NOTE(review): the plaintext is written to 'm' before the tag is checked;
 * callers must discard it when the return value is nonzero.
 *
 * Fixes: header comment wrongly said "Encryption"; the function returned the
 * raw (arbitrary nonzero) byte-OR of the tag difference — it now maps any
 * mismatch to -1 as the API specifies, still in constant time.
 ******************************************************************************/
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
					unsigned char *nsec,
					const unsigned char *c, unsigned long long clen,
					const unsigned char *ad, unsigned long long adlen,
					const unsigned char *npub,
					const unsigned char *k) {
	u64 i, lfsr = 1;				// 64-bit LFSR acts as the block counter
	u8 feedback;					// accumulates the tag difference
	u32 rtk1[4*16];
	u32 rtk2_3[4*SKINNY128_384_ROUNDS];
	u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
	(void)nsec;
	if (clen < TAGBYTES)
		return -1;
	// ----------------- Initialization -----------------
	clen -= TAGBYTES;
	*mlen = clen;
	tkschedule_lfsr(rtk2_3, npub, k, SKINNY128_384_ROUNDS);
	tkschedule_perm(rtk2_3);
	memset(tmp, 0x00, 2*BLOCKBYTES);
	memset(auth, 0x00, BLOCKBYTES);
	memset(sum, 0x00, BLOCKBYTES);
	// ----------------- Initialization -----------------
	// ----------------- Process the ciphertext -----------------
	while (clen >= BLOCKBYTES) { // while entire blocks to process
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp); // precompute RTK1 given the LFSR
		skinny128_384_inv(m, rtk2_3, c, rtk1);
		xor_block(sum, m); // sum for tag computation
		clen -= BLOCKBYTES;
		c += BLOCKBYTES;
		m += BLOCKBYTES;
		UPDATE_LFSR(lfsr); // update LFSR for the next block
	}
	SET_DOMAIN(tmp, 0x04); // domain for tag computation
	if (clen > 0) { // last block is partial
		LE_STR_64(tmp, lfsr); // lfsr for last block
		SET_DOMAIN(tmp, 0x01); // domain for padding
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(auth, rtk2_3, auth, rtk1); // encrypt 'auth' = 0^16
		for(i = 0; i < clen; i++) {
			m[i] = auth[i] ^ c[i]; // decrypted padded block
			sum[i] ^= m[i]; // sum for tag computation
		}
		sum[i] ^= 0x80; // padding
		c += clen;
		SET_DOMAIN(tmp, 0x05); // domain for tag computation
		UPDATE_LFSR(lfsr);
	}
	LE_STR_64(tmp, lfsr); // lfsr for tag computation
	tkschedule_perm_tk1(rtk1, tmp);
	skinny128_384(sum, rtk2_3, sum, rtk1); // compute the tag
	// ----------------- Process the ciphertext -----------------
	// ----------------- Process the associated data -----------------
	lfsr = 1; // restart the counter for the AD
	SET_DOMAIN(tmp, 0x02);
	memset(auth, 0x00, BLOCKBYTES);
	while (adlen >= BLOCKBYTES) {
		LE_STR_64(tmp, lfsr);
		tkschedule_perm_tk1(rtk1, tmp);
		skinny128_384(tmp + BLOCKBYTES, rtk2_3, ad, rtk1);
		xor_block(auth, tmp + BLOCKBYTES);
		adlen -= BLOCKBYTES;
		ad += BLOCKBYTES;
		UPDATE_LFSR(lfsr);
	}
	if (adlen > 0) { // partial AD block: pad with 10* and switch domain
		LE_STR_64(tmp, lfsr);
		SET_DOMAIN(tmp, 0x03); // domain for padding ad
		tkschedule_perm_tk1(rtk1, tmp);
		memset(tmp, 0x00, BLOCKBYTES); // padding
		memcpy(tmp, ad, adlen); // padding
		tmp[adlen] ^= 0x80; // padding
		skinny128_384(tmp, rtk2_3, tmp, rtk1);
		xor_block(auth, tmp);
	}
	xor_block(sum, auth); // fold the AD authenticator into the tag
	// ----------------- Process the associated data -----------------
	feedback = 0;
	for(i = 0; i < TAGBYTES; i++)
		feedback |= sum[i] ^ c[i]; // constant-time tag comparison
	return feedback ? -1 : 0; // -1 on authentication failure
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
//Number of rounds for SKINNY-128-384
#define SKINNY128_384_ROUNDS 56
//Encrypt one 128-bit block (ptext -> ctext) with precomputed round tweakeys.
extern void skinny128_384(u8* ctext, const u32* rtk2_3, const u8* ptext, const u32* rtk1);
//Decrypt one 128-bit block (ctext -> ptext) with precomputed round tweakeys.
extern void skinny128_384_inv(u8* ptext, const u32* rtk2_3, const u8* ctext, const u32* rtk1);
//Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds (implemented in assembly).
extern void tkschedule_lfsr(u32* rtk2_3, const u8* tk2, const u8* tk3, const int rounds);
//Apply the tweakey permutation and add round constants to all round tweakeys.
extern void tkschedule_perm(u32* rtk2_3);
//Compute the (16-round periodic) round tweakeys derived from TK1.
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1);
#endif // SKINNY128_H_
\ No newline at end of file
/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
.syntax unified
.thumb
/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
* State: one bitsliced word per register in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p2:
	movw r1, #0xcc00
	movt r1, #0xcc00 //r1 <- 0xcc00cc00
	movw r10, #0xcc00
	movt r10, #0x0033 //r10<- 0xcc000033
	and r11, r1, r6, ror #14 // --- permute r6 twice
	bfi r11, r6, #16, #8
	and r12, r6, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r6
	orr r11, r11, r12, lsr #8
	and r12, r6, #0x00cc0000
	orr r6, r11, r12, lsr #18 // permute r6 twice ---
	and r11, r1, r7, ror #14 // --- permute r7 twice
	bfi r11, r7, #16, #8
	and r12, r7, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r7
	orr r11, r11, r12, lsr #8
	and r12, r7, #0x00cc0000
	orr r7, r11, r12, lsr #18 // permute r7 twice ---
	and r11, r1, r8, ror #14 // --- permute r8 twice
	bfi r11, r8, #16, #8
	and r12, r8, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r8
	orr r11, r11, r12, lsr #8
	and r12, r8, #0x00cc0000
	orr r8, r11, r12, lsr #18 // permute r8 twice ---
	and r11, r1, r9, ror #14 // --- permute r9 twice
	bfi r11, r9, #16, #8
	and r12, r9, #0xcc000000
	orr r11, r11, r12, lsr #2
	and r12, r10, r9
	orr r11, r11, r12, lsr #8
	and r12, r9, #0x00cc0000
	orr r9, r11, r12, lsr #18 // permute r9 twice ---
	bx lr
/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12; uses [sp] as scratch to save r14.
*******************************************************************************/
.align 2
p4:
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0x00cc
	movt r14, #0xcc00 //r14<- 0xcc0000cc
	movw r12, #0xcc00
	movt r12, #0x3300 //r12<- 0x3300cc00
	movw r11, #0x00cc
	movt r11, #0x00cc //r11<- 0x00cc00cc
	and r10, r14, r6, ror #22 // --- permute r6 4 times
	and r1, r12, r6, ror #16
	orr r10, r10, r1
	and r1, r6, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r6, r6, r1
	orr r6, r10, r6, ror #24 // permute r6 4 times ---
	and r10, r14, r7, ror #22 // --- permute r7 4 times
	and r1, r12, r7, ror #16
	orr r10, r10, r1
	and r1, r7, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r7, r7, r1
	orr r7, r10, r7, ror #24 // permute r7 4 times ---
	and r10, r14, r8, ror #22 // --- permute r8 4 times
	and r1, r12, r8, ror #16
	orr r10, r10, r1
	and r1, r8, r11
	orr r10, r10, r1, lsr #2
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r8, r8, r1
	orr r8, r10, r8, ror #24 // permute r8 4 times ---
	and r10, r14, r9, ror #22 // --- permute r9 4 times
	ldr.w r14, [sp] //restore r14
	and r12, r12, r9, ror #16
	orr r10, r10, r12
	and r12, r9, r11
	orr r10, r10, r12, lsr #2
	movw r12, #0xcc33 //r12<- 0x0000cc33
	and r9, r9, r12
	orr r9, r10, r9, ror #24 // permute r9 4 times ---
	bx lr
/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p6:
	movw r1, #0x3333 //r1 <- 0x00003333
	movw r12, #0x00cc
	movt r12, #0x3300 //r12<- 0x330000cc
	and r10, r6, r1, ror #8 // --- permute r6 6 times
	and r11, r12, r6, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r6, ror #10
	orr r11, r11, r10
	and r10, r6, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r6, #0x00003300
	orr r6, r11, r10, lsl #2 // permute r6 6 times ---
	and r10, r7, r1, ror #8 // --- permute r7 6 times
	and r11, r12, r7, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r7, ror #10
	orr r11, r11, r10
	and r10, r7, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r7, #0x00003300
	orr r7, r11, r10, lsl #2 // permute r7 6 times ---
	and r10, r8, r1, ror #8 // --- permute r8 6 times
	and r11, r12, r8, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r8, ror #10
	orr r11, r11, r10
	and r10, r8, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r8, #0x00003300
	orr r8, r11, r10, lsl #2 // permute r8 6 times ---
	and r10, r9, r1, ror #8 // --- permute r9 6 times
	and r11, r12, r9, ror #24
	orr r11, r11, r10, ror #6
	and r10, r1, r9, ror #10
	orr r11, r11, r10
	and r10, r9, #0x000000cc
	orr r11, r11, r10, lsl #14
	and r10, r9, #0x00003300
	orr r9, r11, r10, lsl #2 // permute r9 6 times ---
	bx lr
/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p8:
	movw r12, #0x3333 //r12<- 0x00003333
	movw r1, #0x0000
	movt r1, #0x33cc //r1 <- 0x33cc0000
	and r10, r6, r1 // --- permute r6 8 times
	and r11, r1, r6, ror #8
	orr r11, r11, r10, ror #24
	and r10, r6, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r6, r12, lsl #8
	orr r6, r11, r10, lsr #6 // permute r6 8 times ---
	and r10, r7, r1 // --- permute r7 8 times
	and r11, r1, r7, ror #8
	orr r11, r11, r10, ror #24
	and r10, r7, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r7, r12, lsl #8
	orr r7, r11, r10, lsr #6 // permute r7 8 times ---
	and r10, r8, r1 // --- permute r8 8 times
	and r11, r1, r8, ror #8
	orr r11, r11, r10, ror #24
	and r10, r8, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r8, r12, lsl #8
	orr r8, r11, r10, lsr #6 // permute r8 8 times ---
	and r10, r9, r1 // --- permute r9 8 times
	and r11, r1, r9, ror #8
	orr r11, r11, r10, ror #24
	and r10, r9, r12, lsl #2
	orr r11, r11, r10, ror #26
	and r10, r9, r12, lsl #8
	orr r9, r11, r10, lsr #6 // permute r9 8 times ---
	bx lr
/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p10:
	movw r12, #0x0033
	movt r12, #0x3300 //r12<- 0x33000033
	movw r1, #0xcc33 //r1 <- 0x0000cc33
	and r10, r6, r1, ror #8 // --- permute r6 10 times
	and r11, r12, r6, ror #26
	orr r11, r11, r10, ror #8
	and r10, r6, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r6, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r6, #0x0000cc00
	orr r6, r11, r10, lsr #2 // permute r6 10 times ---
	and r10, r7, r1, ror #8 // --- permute r7 10 times
	and r11, r12, r7, ror #26
	orr r11, r11, r10, ror #8
	and r10, r7, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r7, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r7, #0x0000cc00
	orr r7, r11, r10, lsr #2 // permute r7 10 times ---
	and r10, r8, r1, ror #8 // --- permute r8 10 times
	and r11, r12, r8, ror #26
	orr r11, r11, r10, ror #8
	and r10, r8, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r8, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r8, #0x0000cc00
	orr r8, r11, r10, lsr #2 // permute r8 10 times ---
	and r10, r9, r1, ror #8 // --- permute r9 10 times
	and r11, r12, r9, ror #26
	orr r11, r11, r10, ror #8
	and r10, r9, r12, ror #24
	orr r11, r11, r10, ror #22
	and r10, r9, #0x00330000
	orr r11, r11, r10, lsr #14
	and r10, r9, #0x0000cc00
	orr r9, r11, r10, lsr #2 // permute r9 10 times ---
	bx lr
/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12; uses [sp] as scratch to save r14.
*******************************************************************************/
.align 2
p12:
	str.w r14, [sp] //store r14 on the stack
	movw r14, #0xcc33 //r14<- 0x0000cc33
	movw r12, #0x00cc
	movt r12, #0x00cc //r12<- 0x00cc00cc
	movw r1, #0x3300
	movt r1, #0xcc00 //r1 <- 0xcc003300
	and r10, r14, r6, ror #8 // --- permute r6 12 times
	and r11, r12, r6, ror #30
	orr r11, r11, r10
	and r10, r1, r6, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r6, r10, ror #8
	orr r6, r11, r10, ror #10 // permute r6 12 times ---
	and r10, r14, r7, ror #8 // --- permute r7 12 times
	and r11, r12, r7, ror #30
	orr r11, r11, r10
	and r10, r1, r7, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r7, r10, ror #8
	orr r7, r11, r10, ror #10 // permute r7 12 times ---
	and r10, r14, r8, ror #8 // --- permute r8 12 times
	and r11, r12, r8, ror #30
	orr r11, r11, r10
	and r10, r1, r8, ror #16
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r8, r10, ror #8
	orr r8, r11, r10, ror #10 // permute r8 12 times ---
	and r10, r14, r9, ror #8 // --- permute r9 12 times
	and r11, r12, r9, ror #30
	orr r11, r11, r10
	and r10, r1, r9, ror #16
	ldr.w r14, [sp] //restore r14
	orr r11, r11, r10
	movw r10, #0xcccc //r10<- 0x0000cccc
	and r10, r9, r10, ror #8
	orr r9, r11, r10, ror #10 // permute r9 12 times ---
	bx lr
/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
* State in r6-r9; clobbers r1, r10-r12.
*******************************************************************************/
.align 2
p14:
	movw r1, #0xcc00
	movt r1, #0x0033 //r1 <- 0x0033cc00
	movw r12, #0xcc00
	movt r12, #0xcc00 //r12<- 0xcc00cc00 (0x33003300 pre-rotated by the ror #18)
	and r10, r1, r6, ror #24 // --- permute r6 14 times
	and r11, r6, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r6, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r6, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r6, r12
	orr r6, r11, r10, ror #18 // permute r6 14 times ---
	and r10, r1, r7, ror #24 // --- permute r7 14 times
	and r11, r7, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r7, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r7, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r7, r12
	orr r7, r11, r10, ror #18 // permute r7 14 times ---
	and r10, r1, r8, ror #24 // --- permute r8 14 times
	and r11, r8, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r8, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r8, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r8, r12
	orr r8, r11, r10, ror #18 // permute r8 14 times ---
	and r10, r1, r9, ror #24 // --- permute r9 14 times
	and r11, r9, #0x00000033
	orr r11, r10, r11, ror #14
	and r10, r9, #0x33000000
	orr r11, r11, r10, ror #30
	and r10, r9, #0x00ff0000
	orr r11, r11, r10, ror #16
	and r10, r9, r12
	orr r9, r11, r10, ror #18 // permute r9 14 times ---
	bx lr
/*******************************************************************************
* packs two 128-bit blocks held in r2-r5 into the bitsliced representation
* Expects the masks r10 = 0x0a0a0a0a and r11 = 0x30303030 to be set by the
* caller; clobbers r12. Implemented as a sequence of SWAPMOVEs.
*******************************************************************************/
.align 2
packing:
	eor r12, r2, r2, lsr #3
	and r12, r12, r10
	eor r2, r2, r12
	eor r2, r2, r12, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
	eor r12, r3, r3, lsr #3
	and r12, r12, r10
	eor r3, r3, r12
	eor r3, r3, r12, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
	eor r12, r4, r4, lsr #3
	and r12, r12, r10
	eor r4, r4, r12
	eor r4, r4, r12, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
	eor r12, r5, r5, lsr #3
	and r12, r12, r10
	eor r5, r5, r12
	eor r5, r5, r12, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
	eor r12, r2, r4, lsr #2
	and r12, r12, r11
	eor r2, r2, r12
	eor r4, r4, r12, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
	eor r12, r2, r3, lsr #4
	and r12, r12, r11, lsr #2
	eor r2, r2, r12
	eor r3, r3, r12, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
	eor r12, r2, r5, lsr #6
	and r12, r12, r11, lsr #4
	eor r2, r2, r12
	eor r5, r5, r12, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
	eor r12, r4, r3, lsr #2
	and r12, r12, r11, lsr #2
	eor r4, r4, r12
	eor r3, r3, r12, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
	eor r12, r4, r5, lsr #4
	and r12, r12, r11, lsr #4
	eor r4, r4, r12
	eor r5, r5, r12, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
	eor r12, r3, r5, lsr #2
	and r12, r12, r11, lsr #4
	eor r3, r3, r12
	eor r5, r5, r12, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
	bx lr
/*******************************************************************************
* unpacks the bitsliced state in r2-r5 back to byte order (inverse of packing,
* SWAPMOVEs applied in reverse). Builds the 0x0a0a0a0a mask in r6 itself;
* r7 is presumably expected to hold the 0x30303030 mask (mirrors r11 in
* packing) — caller not visible here, TODO confirm. Clobbers r10.
*******************************************************************************/
.align 2
unpacking:
	movw r6, #0x0a0a
	movt r6, #0x0a0a //r6 <- 0x0a0a0a0a
	eor r10, r3, r5, lsr #2
	and r10, r10, r7, lsr #4
	eor r3, r3, r10
	eor r5, r5, r10, lsl #2 //SWAPMOVE(r5, r3, 0x03030303, 2)
	eor r10, r4, r5, lsr #4
	and r10, r10, r7, lsr #4
	eor r4, r4, r10
	eor r5, r5, r10, lsl #4 //SWAPMOVE(r5, r4, 0x03030303, 4)
	eor r10, r4, r3, lsr #2
	and r10, r10, r7, lsr #2
	eor r4, r4, r10
	eor r3, r3, r10, lsl #2 //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
	eor r10, r2, r5, lsr #6
	and r10, r10, r7, lsr #4
	eor r2, r2, r10
	eor r5, r5, r10, lsl #6 //SWAPMOVE(r5, r2, 0x03030303, 6)
	eor r10, r2, r3, lsr #4
	and r10, r10, r7, lsr #2
	eor r2, r2, r10
	eor r3, r3, r10, lsl #4 //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
	eor r10, r2, r4, lsr #2
	and r10, r10, r7
	eor r2, r2, r10
	eor r4, r4, r10, lsl #2 //SWAPMOVE(r4, r2, 0x30303030, 2)
	eor r10, r5, r5, lsr #3
	and r10, r10, r6
	eor r5, r5, r10
	eor r5, r5, r10, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
	eor r10, r4, r4, lsr #3
	and r10, r10, r6
	eor r4, r4, r10
	eor r4, r4, r10, lsl #3 //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
	eor r10, r3, r3, lsr #3
	and r10, r10, r6
	eor r3, r3, r10
	eor r3, r3, r10, lsl #3 //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
	eor r10, r2, r2, lsr #3
	and r10, r10, r6
	eor r2, r2, r10
	eor r2, r2, r10, lsl #3 //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
	bx lr
/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
* Register use: r6-r9 hold bitsliced TK2, r2-r5 hold bitsliced TK3,
* r10 holds the 0xaaaaaaaa slice mask, r1 is the loop counter.
* Each loop iteration covers 8 rounds (4 LFSR steps per tweakey, 2 rounds
* per step); stores advance by 24 after every pair of words, leaving a
* 16-byte gap per round — presumably filled later by tkschedule_perm,
* TODO confirm the layout against that routine.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
	push {r0-r12, r14}
	ldr.w r3, [r1, #8] //load tk2 (3rd word)
	ldr.w r4, [r1, #4] //load tk2 (2nd word)
	ldr.w r5, [r1, #12] //load tk2 (4th word)
	ldr.w r12, [r1] //load tk2 (1st word)
	mov r1, r2 //move tk3 address in r1
	mov r2, r12 //move 1st tk2 word in r2
	movw r10, #0x0a0a
	movt r10, #0x0a0a //r10<- 0x0a0a0a0a
	movw r11, #0x3030
	movt r11, #0x3030 //r11<- 0x30303030
	bl packing //pack tk2
	mov r6, r2 //move tk2 from r2-r5 to r6-r9
	mov r7, r3 //move tk2 from r2-r5 to r6-r9
	mov r8, r4 //move tk2 from r2-r5 to r6-r9
	mov r9, r5 //move tk2 from r2-r5 to r6-r9
	ldr.w r3, [r1, #8] //load tk3 (3rd word)
	ldr.w r4, [r1, #4] //load tk3 (2nd word)
	ldr.w r5, [r1, #12] //load tk3 (4th word)
	ldr.w r2, [r1] //load tk3 (1st word)
	bl packing //pack tk3
	eor r10, r10, r10, lsl #4 //r10<- 0xaaaaaaaa
	ldr.w r1, [sp, #12] //load loop counter in r1 (rounds, saved r3 at sp+12)
	eor r11, r2, r6 //tk2 ^ tk3 (1st word)
	eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
	eor r12, r5, r9 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #8 //store in tk
loop: //each iteration processes 8 rounds (4 LFSR steps of each tweakey)
	and r12, r8, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r6
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r6, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r3, r10 // --- apply LFSR3 to tk3
	eor r12, r5, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r5, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r5, r7 //tk2 ^ tk3 (1st word)
	eor r12, r2, r8 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r3, r9 //tk2 ^ tk3 (3rd word)
	eor r12, r4, r6 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	and r12, r9, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r7
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r7, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r2, r10 // --- apply LFSR3 to tk3
	eor r12, r4, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r4, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r4, r8 //tk2 ^ tk3 (1st word)
	eor r12, r5, r9 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r2, r6 //tk2 ^ tk3 (3rd word)
	eor r12, r3, r7 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	and r12, r6, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r8
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r8, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r5, r10 // --- apply LFSR3 to tk3
	eor r12, r3, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r3, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r3, r9 //tk2 ^ tk3 (1st word)
	eor r12, r4, r6 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r5, r7 //tk2 ^ tk3 (3rd word)
	eor r12, r2, r8 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	and r12, r7, r10 // --- apply LFSR2 to tk2
	eor r12, r12, r9
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r9, r14, r12, lsr #1 // apply LFSR2 to tk2 ---
	and r12, r4, r10 // --- apply LFSR3 to tk3
	eor r12, r2, r12, lsr #1
	and r14, r10, r12, lsl #1
	and r12, r12, r10
	orr r2, r14, r12, lsr #1 // apply LFSR3 to tk3 ---
	eor r11, r2, r6 //tk2 ^ tk3 (1st word)
	eor r12, r3, r7 //tk2 ^ tk3 (2nd word)
	strd r11, r12, [r0], #8 //store in tk
	eor r11, r4, r8 //tk2 ^ tk3 (3rd word)
	eor r12, r5, r9 //tk2 ^ tk3 (4th word)
	strd r11, r12, [r0], #24 //store in tk, skip the 16-byte gap
	subs.w r1, r1, #8 //decrease loop counter by 8 (rounds per iteration)
	bne loop
	pop {r0-r12, r14}
	bx lr
/******************************************************************************
* Applies the permutation P and add the round constants to all round tweakeys.
******************************************************************************/
@ void tkschedule_perm(u32* tk)
.global tkschedule_perm
.type tkschedule_perm,%function
.align 2
tkschedule_perm:
push {r0-r12, lr}
sub.w sp, #4 //to store r14 in subroutines
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word)
and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word)
and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word)
and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word)
eor r8, r8, #0x00000004 //add rconst
eor r9, r9, #0x00000040 //add rconst
mvn r9, r9 //to remove a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 1st round
strd r6, r7, [r0], #8 //store 2nd half tk for 1st round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 3rd round
strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 5th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 5th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 4th round
strd r8, r9, [r0], #24 //store 2nd half tk for 4th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 7th round
strd r6, r7, [r0], #8 //store 2nd half tk for 7th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 9th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000054 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 9th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000410
eor r9, r9, #0x00000410
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 8th round
strd r8, r9, [r0], #24 //store 2nd half tk for 8th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r6, r6, #0x04000000 //add rconst
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 11th round
strd r6, r7, [r0], #8 //store 2nd half tk for 11th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 13th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00140000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 13th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 12th round
strd r8, r9, [r0], #24 //store 2nd half tk for 12th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 15th round
strd r6, r7, [r0], #8 //store 2nd half tk for 15th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 17th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 17th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 16th round
strd r8, r9, [r0], #24 //store 2nd half tk for 16th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 18th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 18th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
eor r9, r9, #0x04000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 19th round
strd r6, r7, [r0], #8 //store 2nd half tk for 19th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 21th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00100000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 21th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x04100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 20th round
strd r8, r9, [r0], #24 //store 2nd half tk for 20th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 22th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 22th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 23th round
strd r6, r7, [r0], #8 //store 2nd half tk for 23th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 25th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 25th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 24th round
strd r8, r9, [r0], #24 //store 2nd half tk for 24th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 26th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 26th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x40000000 //add rconst
eor r9, r9, #0x04000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 27th round
strd r6, r7, [r0], #8 //store 2nd half tk for 27th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r12, r12, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 29th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 29th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r8, r8, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 28th round
strd r8, r9, [r0], #24 //store 2nd half tk for 28th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01000000 //add rconst
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 30th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 30th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x01000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 31th round
strd r6, r7, [r0], #8 //store 2nd half tk for 31th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #24] //store 2nd half tk for 33th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000050 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 33th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6 ,r6, #0x00000010
eor r8, r8, #0x00010400
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 32th round
strd r8, r9, [r0], #24 //store 2nd half tk for 32th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 34th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000140 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 34th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 35th round
strd r6, r7, [r0], #8 //store 2nd half tk for 35th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 37th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00440000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 37th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
eor r9, r9, #0x00100000 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 36th round
strd r8, r9, [r0], #24 //store 2nd half tk for 36th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
eor r12, r12, #0x01000000 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 38th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 38th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000400 //add rconst
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 39th round
strd r6, r7, [r0], #8 //store 2nd half tk for 39th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r12, r12, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 41th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000014 //add rconst
eor r12, r12, #0x00000010 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 41th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
eor r9, r9, #0x00000400
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 40th round
strd r8, r9, [r0], #24 //store 2nd half tk for 40th round
ldm r0, {r6-r9} //load tk
bl p10 //apply the permutation 10 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x00000100 //add rconst
strd r11, r12, [r0], #8 //store 1st half tk for 42th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000040 //add rconst
eor r12, r12, #0x00000100 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 42th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r8, r8, #0x44000000 //add rconst
eor r9, r9, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 43th round
strd r6, r7, [r0], #8 //store 2nd half tk for 43th round
ldm r0, {r6-r9} //load tk
bl p12 //apply the permutation 12 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 45th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 45th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x04000000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 44th round
strd r8, r9, [r0], #24 //store 2nd half tk for 44th round
ldm r0, {r6-r9} //load tk
bl p14 //apply the permutation 14 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 46th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x01400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 46th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r8, r8, #0x01000000 //add rconst
eor r8, r8, #0x00004400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 47th round
strd r6, r7, [r0], #8 //store 2nd half tk for 47th round
ldm r0, {r6-r9} //load tk
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6 //ror and mask to match fixslicing
and r12, r10, r7 //ror and mask to match fixslicing
eor r11, r11, #0x00000040 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 49th round
and r11, r10, r8 //ror and mask to match fixslicing
and r12, r10, r9 //ror and mask to match fixslicing
eor r11, r11, #0x00000004 //add rconst
eor r12, r12, #0x00000040 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 49th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r7 ,r7, #0x00000010
eor r8, r8, #0x00010000
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 48th round
strd r8, r9, [r0], #24 //store 2nd half tk for 48th round
ldm r0, {r6-r9} //load tk
bl p2 //apply the permutation twice
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #26 //ror and mask to match fixslicing
and r12, r10, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 50th round
and r11, r10, r8, ror #26 //ror and mask to match fixslicing
and r12, r10, r9, ror #26 //ror and mask to match fixslicing
eor r11, r11, #0x10000000 //add rconst
eor r11, r11, #0x00000100 //add rconst
eor r12, r12, #0x00000140 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 50th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #12
and r11, r10, r7, ror #28
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #12
and r11, r10, r8, ror #28
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #12
and r11, r10, r9, ror #28
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #12 //ror and masks to match fixslicing ---
eor r7, r7, #0x04000000 //add rconst
eor r8, r8, #0x44000000 //add rconst
eor r8, r8, #0x00000100 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 51th round
strd r6, r7, [r0], #8 //store 2nd half tk for 51th round
ldm r0, {r6-r9} //load tk
bl p4 //apply the permutation 4 times
movw r10, #0xf0f0
movt r10, #0xf0f0 //r10<- 0xf0f0f0f0
and r11, r10, r6, ror #16 //ror and mask to match fixslicing
and r12, r10, r7, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00400000 //add rconst
strd r11, r12, [r0, #24] //store 2nd half tk for 53th round
and r11, r10, r8, ror #16 //ror and mask to match fixslicing
and r12, r10, r9, ror #16 //ror and mask to match fixslicing
eor r11, r11, #0x00040000 //add rconst
eor r12, r12, #0x00500000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0, #16] //store 1st half tk for 53th round
and r10, r10, r10, lsr #2 //r10<- 0x30303030
and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #6
and r11, r10, r7, ror #14
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #6
and r11, r10, r8, ror #14
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #6
and r11, r10, r9, ror #14
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #6 //ror and masks to match fixslicing ---
eor r6, r6, #0x00100000 //add rconst
eor r7, r7, #0x00100000 //add rconst
eor r8, r8, #0x00000001 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 52th round
strd r8, r9, [r0], #24 //store 2nd half tk for 52th round
ldm r0, {r6-r9} //load tk
bl p6 //apply the permutation 6 times
movw r10, #0xc3c3
movt r10, #0xc3c3 //r10<- 0xc3c3c3c3
and r11, r10, r6, ror #10 //ror and mask to match fixslicing
and r12, r10, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 54th round
and r11, r10, r8, ror #10 //ror and mask to match fixslicing
and r12, r10, r9, ror #10 //ror and mask to match fixslicing
eor r11, r11, #0x01400000 //add rconst
eor r11, r11, #0x00001000 //add rconst
eor r12, r12, #0x00400000 //add rconst
mvn r12, r12 //to save a NOT in sbox calculations
strd r11, r12, [r0], #8 //store 2nd half tk for 54th round
and r10, r10, r10, lsr #6 //r10<- 0x03030303
and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing
and r6, r6, r10, lsl #6
orr r6, r11, r6, ror #28
and r11, r10, r7, ror #12
and r7, r7, r10, lsl #6
orr r7, r11, r7, ror #28
and r11, r10, r8, ror #12
and r8, r8, r10, lsl #6
orr r8, r11, r8, ror #28
and r11, r10, r9, ror #12
and r9, r9, r10, lsl #6
orr r9, r11, r9, ror #28 //ror and masks to match fixslicing ---
eor r7, r7, #0x00000400 //add rconst
eor r8, r8, #0x01000000
eor r8, r8, #0x00004000 //add rconst
eor r9, r9, #0x00000400 //add rconst
mvn r9, r9 //to save a NOT in sbox calculations
strd r8, r9, [r0], #8 //store 1st half tk for 55th round
strd r6, r7, [r0], #8 //store 2nd half tk for 55th round
ldm r0, {r6-r9} //load tk
bl p8 //apply the permutation 8 times
movw r10, #0x3030
movt r10, #0x3030 //r10<- 0x30303030
and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing
and r6, r6, r10, ror #4
orr r6, r11, r6, ror #22
and r11, r10, r7, ror #30
and r7, r7, r10, ror #4
orr r7, r11, r7, ror #22
and r11, r10, r8, ror #30
and r8, r8, r10, ror #4
orr r8, r11, r8, ror #22
and r11, r10, r9, ror #30
and r9, r9, r10, ror #4
orr r9, r11, r9, ror #22 //ror and masks to match fixslicing ---
eor r6, r6, #0x00000010
eor r8, r8, #0x00010000
eor r8, r8, #0x00000010
mvn r9, r9 //to save a NOT in sbox calculations
strd r6, r7, [r0], #8 //store 1st half tk for 56th round
strd r8, r9, [r0], #24 //store 2nd half tk for 56th round
add.w sp, #4
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Applies the permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
*
* Register contract:
*   r0: pointer to the output round-tweakey array derived from TK1 (fixsliced)
*   r1: pointer to the 16-byte TK1
* Calls the 'packing' and 'p2' subroutines defined elsewhere in this file.
* All registers are saved/restored via the push/pop pair.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
push {r0-r12, lr}
ldr.w r3, [r1, #8] //load tk1 (3rd word)
ldr.w r4, [r1, #4] //load tk1 (2nd word)
ldr.w r5, [r1, #12] //load tk1 (4th word)
ldr.w r2, [r1] //load tk1 (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10<- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11<- 0x30303030 (packing mask)
bl packing //pack tk1
mov r6, r2 //move tk1 from r2-r5 to r6-r9
mov r7, r3 //move tk1 from r2-r5 to r6-r9
mov r8, r4 //move tk1 from r2-r5 to r6-r9
mov r9, r5 //move tk1 from r2-r5 to r6-r9
movw r2, #0xf0f0
movt r2, #0xf0f0 //r2<- 0xf0f0f0f0
and r11, r8, r2 //tk &= 0xf0f0f0f0 (3rd word)
and r12, r9, r2 //tk &= 0xf0f0f0f0 (4th word)
strd r11, r12, [r0], #8 //store 1st half tk for 1st round
and r11, r6, r2 //tk &= 0xf0f0f0f0 (1st word)
and r12, r7, r2 //tk &= 0xf0f0f0f0 (2nd word)
strd r11, r12, [r0], #8 //store 2nd half tk for 1st round
movw r3, #0x3030
movt r3, #0x3030 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #224]
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #228]
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0, #232]
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0, #236]
bl p2 //apply the permutation twice (P^2 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 2nd round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice (P^4 applied in total)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 5th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 5th round
bl p2 //apply the permutation twice (P^6 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3<- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 6th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 6th round
and r3, r3, r3, lsr #6 //r3<- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice (P^8 applied in total)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #30 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r7, ror #30
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r8, ror #30
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #22
str.w r12, [r0], #4
and r11, r3, r9, ror #30
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #22 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6 //ror and mask to match fixslicing
and r12, r2, r7 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 9th round
and r11, r2, r8 //ror and mask to match fixslicing
and r12, r2, r9 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 9th round
bl p2 //apply the permutation twice (P^10 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #26 //ror and mask to match fixslicing
and r12, r3, r7, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 10th round
and r11, r3, r8, ror #26 //ror and mask to match fixslicing
and r12, r3, r9, ror #26 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 10th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #28 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #8]
and r11, r3, r7, ror #28
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #12]
and r11, r3, r9, ror #28
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0, #4]
and r11, r3, r8, ror #28
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #12
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
bl p2 //apply the permutation twice (P^12 applied in total)
lsl r3, r3, #4 //r3 <- 0x30303030
and r11, r3, r6, ror #14 //--- ror and masks to match fixslicing
and r12, r6, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r7, ror #14
and r12, r7, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r8, ror #14
and r12, r8, r3, ror #4
orr r12, r11, r12, ror #6
str.w r12, [r0], #4
and r11, r3, r9, ror #14
and r12, r9, r3, ror #4
orr r12, r11, r12, ror #6 //ror and masks to match fixslicing ---
str.w r12, [r0], #4
and r11, r2, r6, ror #16 //ror and mask to match fixslicing
and r12, r2, r7, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0, #8] //store 2nd half tk for 13th round
and r11, r2, r8, ror #16 //ror and mask to match fixslicing
and r12, r2, r9, ror #16 //ror and mask to match fixslicing
strd r11, r12, [r0], #16 //store 1st half tk for 13th round
bl p2 //apply the permutation twice (P^14 applied in total)
movw r3, #0xc3c3
movt r3, #0xc3c3 //r3 <- 0xc3c3c3c3
and r11, r3, r6, ror #10 //ror and mask to match fixslicing
and r12, r3, r7, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 1st half tk for 14th round
and r11, r3, r8, ror #10 //ror and mask to match fixslicing
and r12, r3, r9, ror #10 //ror and mask to match fixslicing
strd r11, r12, [r0], #8 //store 2nd half tk for 14th round
and r3, r3, r3, lsr #6 //r3 <- 0x03030303
and r11, r3, r6, ror #12 //--- ror and masks to match fixslicing
and r12, r6, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #8]
and r11, r3, r7, ror #12
and r12, r7, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #12]
and r11, r3, r9, ror #12
and r12, r9, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0, #4]
and r11, r3, r8, ror #12
and r12, r8, r3, lsl #6
orr r12, r11, r12, ror #28
str.w r12, [r0], #16 //ror and masks to match fixslicing ---
pop {r0-r12, lr}
bx lr
/******************************************************************************
* Quadruple round of fixsliced SKINNY-128.
*
* Processes 4 cipher rounds on the packed state. Register contract:
*   r2-r5: packed 128-bit state (in/out)
*   r0:    pointer to rtk1 words (advanced by post-indexed ldmia)
*   r1:    pointer to rtk2_3 + round-constant words (advanced likewise)
*   r6:    0x55555555 (SWAPMOVE mask)   r7: 0x30303030 (mixcolumns mask)
* Clobbers r8-r11. Each round = S-box layer (SWAPMOVE-based), key/rconst
* addition, then one of the 4 fixsliced mixcolumns variants.
******************************************************************************/
.align 2
quadruple_round:
//--- round 1: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #30 // --- mixcolumns 0 ---
eor r2, r2, r8, ror #24
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
//--- round 2: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #16 // --- mixcolumns 1 ---
eor r2, r2, r8, ror #30
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #2
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
//--- round 3: S-box layer ---
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rtk_2_3 + rconst
eor r3, r3, r9 //add rtk_2_3 + rconst
eor r4, r4, r10 //add rtk_2_3 + rconst
eor r5, r5, r11 //add rtk_2_3 + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #10 // --- mixcolumns 2 ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #26
eor r2, r2, r8
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #26
eor r5, r5, r8
//--- round 4: S-box layer ---
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
ldmia.w r1!, {r8-r11} //load rkeys in r8,...,r11
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldmia.w r0!,{r8-r11}
eor r2, r2, r8 //add rtk_1
eor r3, r3, r9 //add rtk_1
eor r4, r4, r10 //add rtk_1
eor r5, r5, r11 //add rtk_1
and r8, r7, r2, ror #4 // --- mixcolumns 3 ---
eor r2, r2, r8, ror #26
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #22
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
bx lr
/******************************************************************************
* Inverse quadruple round of fixsliced SKINNY-128.
*
* Undoes 4 cipher rounds on the packed state. Register contract mirrors
* quadruple_round, but the key pointers walk backwards:
*   r2-r5: packed 128-bit state (in/out)
*   r0:    pointer to rtk1 words (decremented via post-indexed ldrd, #-8)
*   r1:    pointer to rtk2_3 + round-constant words (decremented likewise)
*   r6:    0x55555555 (SWAPMOVE mask)   r7: 0x30303030 (mixcolumns mask)
* Clobbers r8-r11. Each round = inverse mixcolumns, key/rconst addition,
* then the inverse S-box layer (SWAPMOVE sequences in reverse order).
******************************************************************************/
.align 2
inv_quadruple_round:
and r8, r7, r2, ror #4 // --- mixcolumns 3 (inverse) ---
eor r2, r2, r8, ror #22
and r8, r7, r2
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #4
eor r2, r2, r8, ror #26
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #22
and r8, r7, r3
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #4
eor r3, r3, r8, ror #26
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #22
and r8, r7, r4
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #4
eor r4, r4, r8, ror #26
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #22
and r8, r7, r5
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #4
eor r5, r5, r8, ror #26
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #26 // --- mixcolumns 2 (inverse) ---
eor r2, r2, r8
and r8, r7, r2, ror #6
eor r2, r2, r8, ror #6
and r8, r7, r2, ror #10
eor r2, r2, r8, ror #4
and r8, r7, r3, ror #26
eor r3, r3, r8
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #6
and r8, r7, r3, ror #10
eor r3, r3, r8, ror #4
and r8, r7, r4, ror #26
eor r4, r4, r8
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #6
and r8, r7, r4, ror #10
eor r4, r4, r8, ror #4
and r8, r7, r5, ror #26
eor r5, r5, r8
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #6
and r8, r7, r5, ror #10
eor r5, r5, r8, ror #4
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rk2_3 + rconst
eor r3, r3, r9 //add rk2_3 + rconst
eor r4, r4, r10 //add rk2_3 + rconst
eor r5, r5, r11 //add rk2_3 + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
and r8, r7, r2, ror #16 // --- mixcolumns 1 (inverse) ---
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #28
eor r2, r2, r8
and r8, r7, r2, ror #16
eor r2, r2, r8, ror #30
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #28
eor r3, r3, r8
and r8, r7, r3, ror #16
eor r3, r3, r8, ror #30
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #28
eor r4, r4, r8
and r8, r7, r4, ror #16
eor r4, r4, r8, ror #30
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #28
eor r5, r5, r8
and r8, r7, r5, ror #16
eor r5, r5, r8, ror #30
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r3, r4
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8 //SWAPMOVE(r4, r3, 0x55555555, 0);
orr r8, r2, r3
eor r5, r5, r8
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
and r8, r7, r2, ror #6 // --- mixcolumns 0 (inverse) ---
eor r2, r2, r8, ror #4
and r8, r7, r2, ror #18
eor r2, r2, r8, ror #2
and r8, r7, r2, ror #30
eor r2, r2, r8, ror #24
and r8, r7, r3, ror #6
eor r3, r3, r8, ror #4
and r8, r7, r3, ror #18
eor r3, r3, r8, ror #2
and r8, r7, r3, ror #30
eor r3, r3, r8, ror #24
and r8, r7, r4, ror #6
eor r4, r4, r8, ror #4
and r8, r7, r4, ror #18
eor r4, r4, r8, ror #2
and r8, r7, r4, ror #30
eor r4, r4, r8, ror #24
and r8, r7, r5, ror #6
eor r5, r5, r8, ror #4
and r8, r7, r5, ror #18
eor r5, r5, r8, ror #2
and r8, r7, r5, ror #30
eor r5, r5, r8, ror #24
ldrd r10, r11, [r1], #-8 //load rtk2_3 words, walking backwards
ldrd r8, r9, [r1], #-8
eor r2, r2, r8 //add rkey + rconst
eor r3, r3, r9 //add rkey + rconst
eor r4, r4, r10 //add rkey + rconst
eor r5, r5, r11 //add rkey + rconst
ldrd r10, r11, [r0], #-8 //load rtk1 words, walking backwards
ldrd r8, r9, [r0], #-8
eor r2, r2, r8 //add rtk1
eor r3, r3, r9 //add rtk1
eor r4, r4, r10 //add rtk1
eor r5, r5, r11 //add rtk1
eor r8, r2, r5
and r8, r8, r6
eor r2, r2, r8
eor r5, r5, r8 //SWAPMOVE(r5, r2, 0x55555555, 0);
orr r8, r4, r5
eor r3, r3, r8
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
eor r8, r5, r2, lsr #1
and r8, r8, r6
eor r5, r5, r8
eor r2, r2, r8, lsl #1 //SWAPMOVE(r2, r5, 0x55555555, 1);
eor r8, r2, r3, lsr #1
and r8, r8, r6
eor r2, r2, r8
eor r3, r3, r8, lsl #1 //SWAPMOVE(r3, r2, 0x55555555, 1);
orr r8, r4, r5
eor r3, r3, r8
mvn r3, r3
eor r8, r4, r5, lsr #1
and r8, r8, r6
eor r4, r4, r8
eor r5, r5, r8, lsl #1 //SWAPMOVE(r5, r4, 0x55555555, 1);
eor r8, r3, r4, lsr #1
and r8, r8, r6
eor r3, r3, r8
eor r4, r4, r8, lsl #1 //SWAPMOVE(r4, r3, 0x55555555, 1);
orr r8, r2, r3
eor r5, r5, r8
mvn r5, r5
bx lr
/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384.
* (The original header said SKINNY-128-128; the code below runs 14 quadruple
* rounds = 56 rounds with a 384-bit tweakey schedule, i.e. the -384 variant.)
*
* r0: ctext (output), r1: precomputed rtk2_3 schedule, r2: ptext,
* r3: precomputed rtk1 schedule (repeats every 16 rounds).
******************************************************************************/
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type skinny128_384,%function
.align 2
skinny128_384:
push {r0-r12, r14} //r0 (ctext ptr) is recovered from the stack at the end
mov.w r0, r3 //r0 now walks the rtk1 schedule
ldr.w r3, [r2, #8] //load ptext (3rd word)
ldr.w r4, [r2, #4] //load ptext (2nd word)
ldr.w r5, [r2, #12] //load ptext (4th word)
ldr.w r2, [r2] //load ptext (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030 (packing mask)
bl packing //pack the plaintext into fixsliced representation
mov r7, r11 //r7 <- 0x30303030 (mixcolumns mask for quadruple_round)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
bl quadruple_round //rounds 1-4
bl quadruple_round //rounds 5-8
bl quadruple_round //rounds 9-12
bl quadruple_round //rounds 13-16
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 17-20
bl quadruple_round //rounds 21-24
bl quadruple_round //rounds 25-28
bl quadruple_round //rounds 29-32
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 33-36
bl quadruple_round //rounds 37-40
bl quadruple_round //rounds 41-44
bl quadruple_round //rounds 45-48
sub.w r0, #256 // rtk1 repeats every 16 rounds
bl quadruple_round //rounds 49-52
bl quadruple_round //rounds 53-56
bl unpacking //unpack the state back to byte order
ldr.w r0, [sp], #4 //restore the ctext pointer pushed at entry
strd r2, r4, [r0] //store ciphertext words 0-1
strd r3, r5, [r0, #8] //store ciphertext words 2-3
pop {r1-r12,r14} //r0 was already popped above
bx lr
/******************************************************************************
* Decrypt a single block using fixsliced SKINNY-128-384.
* (The original header said SKINNY-128-128; the code below runs 14 inverse
* quadruple rounds = 56 rounds with a 384-bit tweakey schedule.)
*
* NOTE(review): the prototype comment reuses the encrypt parameter names;
* here r2 is the input (ciphertext) block and r0 the output (plaintext) —
* confirm against the C callers.
******************************************************************************/
@ void skinny128_384_inv(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384_inv
.type skinny128_384_inv,%function
.align 2
skinny128_384_inv:
push {r0-r12, r14} //r0 (output ptr) is recovered from the stack at the end
mov.w r0, r3 //r0 now walks the rtk1 schedule
ldr.w r3, [r2, #8] //load input block (3rd word)
ldr.w r4, [r2, #4] //load input block (2nd word)
ldr.w r5, [r2, #12] //load input block (4th word)
ldr.w r2, [r2] //load input block (1st word)
movw r10, #0x0a0a
movt r10, #0x0a0a //r10 <- 0x0a0a0a0a (packing mask)
movw r11, #0x3030
movt r11, #0x3030 //r11 <- 0x30303030 (packing mask)
bl packing //pack the input into fixsliced representation
mov r7, r11 //r7 <- 0x30303030 (mixcolumns mask)
movw r6, #0x5555
movt r6, #0x5555 //r6 <- 0x55555555 (SWAPMOVE mask)
add.w r0, #120 // points to the right rtk1 (last-round words; TODO confirm offset derivation)
add.w r1, #888 // points to the last rtk2_3 + rconst words (TODO confirm offset derivation)
bl inv_quadruple_round //rounds 56-53
bl inv_quadruple_round //rounds 52-49
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round //rounds 48-45
bl inv_quadruple_round //rounds 44-41
bl inv_quadruple_round //rounds 40-37
bl inv_quadruple_round //rounds 36-33
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round //rounds 32-29
bl inv_quadruple_round //rounds 28-25
bl inv_quadruple_round //rounds 24-21
bl inv_quadruple_round //rounds 20-17
add.w r0, #256 // rtk1 repeats every 16 rounds
bl inv_quadruple_round //rounds 16-13
bl inv_quadruple_round //rounds 12-9
bl inv_quadruple_round //rounds 8-5
bl inv_quadruple_round //rounds 4-1
bl unpacking //unpack the state back to byte order
ldr.w r0, [sp], #4 //restore the output pointer pushed at entry
strd r2, r4, [r0] //store output words 0-1
strd r3, r5, [r0, #8] //store output words 2-3
pop {r1-r12,r14} //r0 was already popped above
bx lr
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
//Fixed-width aliases used throughout the SKINNY-AEAD-M1 implementation
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
//Parameter sizes in bytes
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
//Write the domain-separation byte into the last byte of a 16-byte block
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
//Clock the 64-bit LFSR once: shift left; if the MSB was set, xor in 0x1B.
//NOTE(review): relies on a variable named 'feedback' declared at the call
//site (GCC statement expression) — confirm every caller declares it.
#define UPDATE_LFSR(lfsr) ({ \
    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
    (lfsr) = ((lfsr) << 1) ^ feedback; \
})
//Store the 64-bit value x into ptr[0..7] in little-endian byte order
#define LE_STR_64(ptr, x) ({ \
    (ptr)[0] = (u8)(x); \
    (ptr)[1] = (u8)((x) >> 8); \
    (ptr)[2] = (u8)((x) >> 16); \
    (ptr)[3] = (u8)((x) >> 24); \
    (ptr)[4] = (u8)((x) >> 32); \
    (ptr)[5] = (u8)((x) >> 40); \
    (ptr)[6] = (u8)((x) >> 48); \
    (ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
//NOTE(review): casts to u32* — assumes 4-byte-aligned operands and relies on
//type punning; verify callers pass suitably aligned buffers.
#define XOR_BLOCK(x,y) ({ \
    ((u32*)(x))[0] ^= ((u32*)(y))[0]; \
    ((u32*)(x))[1] ^= ((u32*)(y))[1]; \
    ((u32*)(x))[2] ^= ((u32*)(y))[2]; \
    ((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
//API required by the NIST for the LWC competition
// Encrypts m[0..mlen) under key k and nonce npub, authenticating ad[0..adlen).
// Writes ciphertext || tag to c and sets *clen = mlen + CRYPTO_ABYTES.
// Returns 0 on success.
int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec, const unsigned char *npub,
    const unsigned char *k);
//API required by the NIST for the LWC competition
// Decrypts c[0..clen) (ciphertext || tag) and verifies the tag.
// On success writes the plaintext to m, sets *outputmlen and returns 0;
// returns non-zero on authentication failure.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub, const unsigned char *k);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // In-place XOR of one 16-byte block: x ^= y, byte by byte.
    for (int idx = BLOCKBYTES; idx-- > 0; )
        x[idx] ^= y[idx];
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions.
******************************************************************************/
// Process the associated data of SKINNY-AEAD-M1, two blocks at a time with
// the double-block SKINNY-128-384 primitive. The AD authentication value is
// accumulated into `auth`. Domain bytes: 0x02 = full AD block, 0x03 = final
// partial (padded) AD block.
// When mlen == 0 the message path has not produced the tag yet, so the tag
// encryption (tweakey `tag`, output through `c`) is fused with the last AD
// block here to keep the two SKINNY instances running in parallel.
// Parameters:
//   auth    - out: 16-byte AD authentication accumulator (zeroed here)
//   c       - in/out: tag buffer used only on the mlen == 0 fused path
//   tag     - in: tweakey block for the tag computation (mlen == 0 path)
//   rtk1    - scratch: per-call round tweakey (TK1) storage
//   rtk2_3  - in: precomputed TK2/TK3 round tweakeys
//   mlen    - message length, only tested against 0 (tag-ready flag)
//   ad, adlen - associated data and its length in bytes
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, u32* rtk1,
        u32* rtk2_3, u64 mlen, const u8* ad, u64 adlen) {
    u64 lfsr = 1; // 64-bit block counter, clocked by UPDATE_LFSR
    u8 feedback; // required by the UPDATE_LFSR macro
    u8 tmp[2*BLOCKBYTES]; // two tweakey/scratch blocks
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    while (adlen >= 2*BLOCKBYTES) { // 2 full AD blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        tkschedule_perm_tk1(rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > BLOCKBYTES) { // pad and process 2 blocs
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padding ad
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        memset(tmp, 0x00, BLOCKBYTES); // tmp reused as the padded AD block
        memcpy(tmp, ad + BLOCKBYTES, adlen);
        tmp[adlen] ^= 0x80; // 10* padding
        skinny128_384(tmp + BLOCKBYTES, tmp, ad, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
    } else if (adlen == BLOCKBYTES) { // exactly one full AD block left
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tag); // fuse AD block with tag encryption
            skinny128_384(auth, c, ad, c, rtk1, rtk2_3);
        } else { // if tag has been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tmp); // process last ad block
            skinny128_384(auth, auth, ad, ad, rtk1, rtk2_3);
        }
    } else if (adlen > 0) { // final partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // padding
        memcpy(tmp + BLOCKBYTES, ad, adlen); // padding
        tmp[BLOCKBYTES + adlen] ^= 0x80; // padding
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tag); // compute the tag
            skinny128_384(auth, c, tmp + BLOCKBYTES, c, rtk1, rtk2_3);
        } else { // if tag has been calculated yet
            tkschedule_perm_tk1(rtk1, tmp, tmp); // process last ad block
            skinny128_384(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, rtk1, rtk2_3);
        }
    }
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 encryption (NIST LWC API). Processes the message two
// blocks at a time with the double-block SKINNY-128-384 primitive, then
// authenticates the associated data and appends the 16-byte tag.
// Domain bytes: 0x00 full msg block (default), 0x01 padded msg block,
// 0x04/0x05 tag computation (full/partial last block).
// Returns 0; *clen is set to mlen + TAGBYTES.
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback; // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1; // 64-bit block counter
    u32 rtk1[8*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // checksum of plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384(c, c + BLOCKBYTES, m, m + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (mlen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padding m
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // second instance encrypts `auth` (zeros) to get the keystream
        // for the partial block
        skinny128_384(c, auth, m, auth, rtk1, rtk2_3);
        xor_block(sum, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(sum, m); // sum for tag computation
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // encrypt last block and the tag in parallel (mlen left non-zero
        // so the tag path below and in the AD routine is skipped)
        skinny128_384(c, sum, m, sum, rtk1, rtk2_3);
        c += BLOCKBYTES;
    } else if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++) // sum for tag computation
            sum[i] ^= m[i]; // sum for tag computation
        sum[i] ^= 0x80; // padding
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        // keystream generation and tag encryption in parallel
        skinny128_384(auth, sum, auth, sum, rtk1, rtk2_3);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        // if the AD ends on an even/partial-double boundary the tag cannot
        // be fused with the last AD block, so compute it here
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag);
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, mlen, ad, adlen);
    xor_block(sum, auth);
    memcpy(c, sum, TAGBYTES); // append the 16-byte tag
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Decryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 decryption (NIST LWC API). Mirrors crypto_aead_encrypt:
// decrypts two blocks at a time, recomputes the checksum/tag, authenticates
// the AD, then verifies the tag in constant time.
// Returns 0 on success and -1 on failure (tag mismatch or clen < TAGBYTES),
// per the NIST API convention. (Previously the raw XOR accumulator - an
// arbitrary non-zero byte - was returned on failure.)
// NOTE(review): the candidate plaintext is written to m before the tag is
// verified; callers must discard m when the return value is non-zero.
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    u8 feedback; // required by the UPDATE_LFSR macro
    u64 i,lfsr = 1; // 64-bit block counter
    u32 rtk1[8*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[8*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    if (clen < TAGBYTES)
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES; // clen now holds the ciphertext-only length
    *mlen = clen;
    tkschedule_lfsr_2(rtk2_3, npub, npub, SKINNY128_384_ROUNDS);
    tkschedule_lfsr_3(rtk2_3, k, k, SKINNY128_384_ROUNDS);
    tkschedule_perm(rtk2_3);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // checksum of recovered plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_inv(m, m + BLOCKBYTES, c, c + BLOCKBYTES, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (clen > BLOCKBYTES) { // one full block + one partial block left
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3); // decrypt the full block
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding m
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr);
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384_inv(m, m, c, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        SET_DOMAIN(tag, 0x04); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x01); // domain for padding
        tkschedule_perm_tk1(rtk1, tmp, tmp);
        skinny128_384(auth, auth, auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += clen;
        clen = 0;
    }
    if (clen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr);
        // if the AD ends on an even/partial-double boundary the tag cannot
        // be fused with the last AD block, so compute it here
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            tkschedule_perm_tk1(rtk1, tag, tag); //if AD can be processed in //
            skinny128_384(sum, sum, sum, sum, rtk1, rtk2_3); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, rtk1, rtk2_3, clen, ad, adlen);
    xor_block(sum, auth);
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    return (feedback != 0) ? -1 : 0; // normalize to the 0 / -1 API convention
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
typedef unsigned char u8;
typedef unsigned int u32;
#define SKINNY128_384_ROUNDS 56
// Double-block fixsliced SKINNY-128-384: encrypts/decrypts two independent
// 16-byte blocks at once (assembly implementations).
extern void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const u32* rtk1, const u32* rtk2_3);
extern void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const u32* rtk1, const u32* rtk2_3);
// Tweakey schedule: apply the TK2/TK3 LFSRs and the tweakey permutation
// for `rounds` rounds, producing interleaved round tweakeys for two blocks.
extern void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
// NOTE(review): pack_tk1 is declared but its use is not visible here - confirm.
extern void pack_tk1(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds);
extern void tkschedule_lfsr_3(u32* rtk, const u8* tk3, const u8* tk3_bis, const int rounds);
extern void tkschedule_perm(u32* rtk);
// TK1 schedule for one pair of 16-byte tweakey blocks (16 rounds).
extern void tkschedule_perm_tk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
#endif // SKINNY128_H_
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // 128-bit authentication tag
#define KEYBYTES 16 // 128-bit key
#define BLOCKBYTES 16 // 128-bit cipher block
// Write the SKINNY-AEAD domain-separation byte into the last byte of a
// 16-byte tweakey block.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Clock the 64-bit block-counter LFSR once (feedback constant 0x1B).
// NOTE: expects a local variable `u8 feedback;` to be declared by the caller.
#define UPDATE_LFSR(lfsr) ({ \
    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
    (lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Store the 64-bit value x into ptr[0..7] in little-endian byte order.
#define LE_STR_64(ptr, x) ({ \
    (ptr)[0] = (u8)(x); \
    (ptr)[1] = (u8)((x) >> 8); \
    (ptr)[2] = (u8)((x) >> 16); \
    (ptr)[3] = (u8)((x) >> 24); \
    (ptr)[4] = (u8)((x) >> 32); \
    (ptr)[5] = (u8)((x) >> 40); \
    (ptr)[6] = (u8)((x) >> 48); \
    (ptr)[7] = (u8)((x) >> 56); \
})
//x ^= y with x, y 128-bit blocks
// NOTE(review): XORs through u32 casts - assumes x and y are 4-byte
// aligned; confirm on targets with strict alignment.
#define XOR_BLOCK(x,y) ({ \
    ((u32*)(x))[0] ^= ((u32*)(y))[0]; \
    ((u32*)(x))[1] ^= ((u32*)(y))[1]; \
    ((u32*)(x))[2] ^= ((u32*)(y))[2]; \
    ((u32*)(x))[3] ^= ((u32*)(y))[3]; \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
#define CRYPTO_KEYBYTES 16 // 128-bit key
#define CRYPTO_NSECBYTES 0 // no secret message number
#define CRYPTO_NPUBBYTES 16 // 128-bit public nonce
#define CRYPTO_ABYTES 16 // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1
// NIST LWC API: encrypt m under (k, npub), authenticate ad, write
// ciphertext || tag to c with *clen = mlen + CRYPTO_ABYTES. Returns 0.
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
);
// NIST LWC API: decrypt and verify c (ciphertext || tag); on success the
// plaintext is in m and *mlen is set. Returns non-zero on failure.
int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
);
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    // In-place XOR of one 16-byte block: x ^= y, byte by byte.
    for (int pos = BLOCKBYTES - 1; pos >= 0; pos--)
        x[pos] ^= y[pos];
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 encryption (NIST LWC API), single-block C implementation.
// Encrypts the message one block at a time, accumulates the plaintext
// checksum in the tag slot (c + mlen), then computes the tag and XORs in
// the AD authentication value. Returns 0; *clen = mlen + TAGBYTES.
// Fix: removed leftover debug printf loops that dumped the entire round
// tweakey schedule (rtk1 and rtk2_3, i.e. key-dependent material) to stdout.
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
        const unsigned char *m, unsigned long long mlen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *nsec,
        const unsigned char *npub,
        const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter
    u8 feedback; // required by the UPDATE_LFSR macro
    u32 rtk1[4*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES); // tag slot doubles as the checksum accumulator
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_encrypt(c, m, rtk1, rtk2_3);
        xor_block(c + mlen, m); // sum for tag computation
        mlen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update lfsr for next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        for(i = 0; i < mlen; i++)
            c[mlen + i] ^= m[i]; // sum for tag computation
        c[mlen + i] ^= 0x80; // padding
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own block counter
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // final partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(c, auth); // XOR for tag computation
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
// SKINNY-AEAD-M1 decryption (NIST LWC API), single-block C implementation.
// Mirrors crypto_aead_encrypt: decrypts block by block, recomputes the
// checksum/tag, authenticates the AD, then verifies the tag in constant
// time. Returns 0 on success and -1 on failure, per the NIST API
// convention (previously the raw non-zero XOR accumulator was returned).
// NOTE(review): the candidate plaintext is written to m before the tag is
// verified; callers must discard m when the return value is non-zero.
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
        unsigned char *nsec,
        const unsigned char *c, unsigned long long clen,
        const unsigned char *ad, unsigned long long adlen,
        const unsigned char *npub,
        const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter
    u8 feedback; // required by the UPDATE_LFSR macro
    u32 rtk1[4*16]; // round tweakeys derived from TK1 (counter/domain)
    u32 rtk2_3[4*SKINNY128_384_ROUNDS]; // round tweakeys from TK2 (nonce) and TK3 (key)
    u8 tmp[2*BLOCKBYTES], auth[BLOCKBYTES], sum[BLOCKBYTES];
    (void)nsec; // no secret message number in this scheme
    if (clen < TAGBYTES)
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES; // clen now holds the ciphertext-only length
    *mlen = clen;
    precompute_rtk2_3(rtk2_3, npub, k);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // checksum of recovered plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= BLOCKBYTES) { // while entire blocks to process
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp); // precompute RTK1 given the LFSR
        skinny128_384_decrypt(m, c, rtk1, rtk2_3);
        xor_block(sum, m); // sum for tag computation
        clen -= BLOCKBYTES;
        c += BLOCKBYTES;
        m += BLOCKBYTES;
        UPDATE_LFSR(lfsr); // update LFSR for the next block
    }
    SET_DOMAIN(tmp, 0x04); // domain for tag computation
    if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(auth, auth, rtk1, rtk2_3); // keystream block
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // padding
        c += clen;
        SET_DOMAIN(tmp, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
    }
    LE_STR_64(tmp, lfsr); // lfsr for tag computation
    precompute_rtk1(rtk1, tmp);
    skinny128_384_encrypt(sum, sum, rtk1, rtk2_3); // compute the tag
    // ----------------- Process the ciphertext -----------------
    // ----------------- Process the associated data -----------------
    lfsr = 1; // AD uses its own block counter
    SET_DOMAIN(tmp, 0x02); // domain for full AD blocks
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        LE_STR_64(tmp, lfsr);
        precompute_rtk1(rtk1, tmp);
        skinny128_384_encrypt(tmp + BLOCKBYTES, ad, rtk1, rtk2_3);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        ad += BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > 0) { // final partial AD block
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        precompute_rtk1(rtk1, tmp);
        memset(tmp, 0x00, BLOCKBYTES); // padding
        memcpy(tmp, ad, adlen); // padding
        tmp[adlen] ^= 0x80; // padding
        skinny128_384_encrypt(tmp, tmp, rtk1, rtk2_3);
        xor_block(auth, tmp);
    }
    xor_block(sum, auth); // XOR for tag computation
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag verification
    return (feedback != 0) ? -1 : 0; // normalize to the 0 / -1 API convention
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of the SKINNY tweakable block ciphers.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical representation after only 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* The Sbox computation takes advantage of some symmetry in the 8-bit Sbox to
* turn it into a 4-bit S-box computation. Although the last bit permutation
* within the Sbox is not computed, the bit ordering is synchronized with the
* classical representation after 2 calls.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 0
******************************************************************************/
void mixcolumns_0(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 0; applied to
    // each of the 4 state slices in turn.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 24) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 30);
        t = ROR(state[row], 16) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 4);
        t = ROR(state[row], 8) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 2);
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 1
******************************************************************************/
void mixcolumns_1(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 1.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 30);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 2);
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 2
******************************************************************************/
void mixcolumns_2(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 2.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 8) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 6);
        t = ROR(state[row], 16) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 24) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 2);
    }
}
/******************************************************************************
* The MixColumns computation for rounds i such that (i % 4) == 3
******************************************************************************/
void mixcolumns_3(u32* state) {
    // Bitsliced MixColumns for rounds with (round % 4) == 3.
    for (int row = 0; row < 4; row++) {
        u32 t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 30);
        t = state[row] & 0x30303030;
        state[row] ^= ROR(t, 4);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 26);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 0
******************************************************************************/
void inv_mixcolumns_0(u32* state) {
    // Inverse of mixcolumns_0: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 8) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 2);
        t = ROR(state[row], 16) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 4);
        t = ROR(state[row], 24) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 30);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 1
******************************************************************************/
void inv_mixcolumns_1(u32* state) {
    // Inverse of mixcolumns_1: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 2);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 16) & 0x30303030;
        state[row] ^= ROR(t, 30);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 2
******************************************************************************/
void inv_mixcolumns_2(u32* state) {
    // Inverse of mixcolumns_2: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = ROR(state[row], 24) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 2);
        t = ROR(state[row], 16) & 0x0c0c0c0c;
        state[row] ^= ROR(t, 28);
        t = ROR(state[row], 8) & 0xc0c0c0c0;
        state[row] ^= ROR(t, 6);
    }
}
/******************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 3
******************************************************************************/
void inv_mixcolumns_3(u32* state) {
    // Inverse of mixcolumns_3: same three XOR steps applied in reverse order.
    for (int row = 0; row < 4; row++) {
        u32 t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 26);
        t = state[row] & 0x30303030;
        state[row] ^= ROR(t, 4);
        t = state[row] & 0x03030303;
        state[row] ^= ROR(t, 30);
    }
}
/******************************************************************************
* Encryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
******************************************************************************/
/*
 * Encrypt one 16-byte block with fixsliced SKINNY-128-384 (no mode of
 * operation). rtk1 (TK1 round keys, 16 rounds long) and rtk2_3 (TK2/TK3
 * round keys, 56 rounds long) are kept separate because TK2/TK3 stay
 * fixed across a whole encryption/decryption session.
 */
void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
        const u32* rtk2_3) {
    u32 tmp; // scratch word required by SWAPMOVE inside the round macro
    u32 state[4]; // 128-bit bitsliced state
    packing(state, ptext); // from byte to bitsliced representation
    // 56 rounds = 14 quadruple rounds; the rtk1 offset cycles through
    // 0,16,32,48 (TK1 schedule repeats every 16 rounds) while rtk2_3
    // advances by 16 words per quadruple round.
    for (int r = 0; r < 14; r++)
        QUADRUPLE_ROUND(state, rtk1 + 16*(r & 3), rtk2_3 + 16*r);
    unpacking(ctext, state); // from bitsliced to byte representation
}
/******************************************************************************
* Decryption of a single block without any operation mode using SKINNY-128-384.
* RTK1 and RTK2_3 are given separately to take advantage of the fact that
* TK2 and TK3 remains the same through the entire data encryption/decryption.
******************************************************************************/
/*
 * Decrypt one 16-byte block with fixsliced SKINNY-128-384 (no mode of
 * operation): the 14 inverse quadruple rounds of skinny128_384_encrypt,
 * applied with the same round-tweakey offsets in reverse order.
 */
void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
        const u32* rtk2_3) {
    u32 tmp; // scratch word required by SWAPMOVE inside the round macro
    u32 state[4]; // 128-bit bitsliced state
    packing(state, ptext); // from byte to bitsliced representation
    // Same offset pattern as encryption (rtk1 cycles 0,16,32,48; rtk2_3
    // advances 16 words per quadruple round), walked backwards.
    for (int r = 13; r >= 0; r--)
        INV_QUADRUPLE_ROUND(state, rtk1 + 16*(r & 3), rtk2_3 + 16*r);
    unpacking(ctext, state); // from bitsliced to byte representation
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
// Fixsliced single-block SKINNY-128-384 (C implementation). rtk1 holds the
// TK1 round keys (16 rounds, reused cyclically); rtk2_3 holds the combined
// TK2/TK3 round keys for all 56 rounds.
void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2_3);
#define SKINNY128_384_ROUNDS 56
// Four consecutive SKINNY-128-384 rounds on the bitsliced state. Each round
// is: S-box layer (the NOR/XOR + SWAPMOVE sequence), round-tweakey addition
// (rtk1 words 4r..4r+3 and rtk2_3 words 4r..4r+3, round constants folded in),
// then the MixColumns variant matching the round index mod 4. ShiftRows and
// the final S-box bit permutation are omitted; the representation
// re-synchronizes with the classical one every 4 rounds (see file banner).
// Requires a local `u32 tmp;` in the calling scope for SWAPMOVE.
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[0] ^= (rtk1)[0]; \
    state[1] ^= (rtk1)[1]; \
    state[2] ^= (rtk1)[2]; \
    state[3] ^= (rtk1)[3]; \
    state[0] ^= (rtk2_3)[0]; \
    state[1] ^= (rtk2_3)[1]; \
    state[2] ^= (rtk2_3)[2]; \
    state[3] ^= (rtk2_3)[3]; \
    mixcolumns_0(state); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[0] ^= (rtk1)[4]; \
    state[1] ^= (rtk1)[5]; \
    state[2] ^= (rtk1)[6]; \
    state[3] ^= (rtk1)[7]; \
    state[0] ^= (rtk2_3)[4]; \
    state[1] ^= (rtk2_3)[5]; \
    state[2] ^= (rtk2_3)[6]; \
    state[3] ^= (rtk2_3)[7]; \
    mixcolumns_1(state); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[0] ^= (rtk1)[8]; \
    state[1] ^= (rtk1)[9]; \
    state[2] ^= (rtk1)[10]; \
    state[3] ^= (rtk1)[11]; \
    state[0] ^= (rtk2_3)[8]; \
    state[1] ^= (rtk2_3)[9]; \
    state[2] ^= (rtk2_3)[10]; \
    state[3] ^= (rtk2_3)[11]; \
    mixcolumns_2(state); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[0] ^= (rtk1)[12]; \
    state[1] ^= (rtk1)[13]; \
    state[2] ^= (rtk1)[14]; \
    state[3] ^= (rtk1)[15]; \
    state[0] ^= (rtk2_3)[12]; \
    state[1] ^= (rtk2_3)[13]; \
    state[2] ^= (rtk2_3)[14]; \
    state[3] ^= (rtk2_3)[15]; \
    mixcolumns_3(state); \
})
/*
 * Inverse of QUADRUPLE_ROUND: undoes 4 fixsliced SKINNY-128 rounds.
 * Each of the 4 segments below is the exact mirror of one forward round:
 * inverse MixColumns, round-tweakey re-addition (rtk1 and rtk2_3 words
 * 12..15 down to 0..3), then the inverse bitsliced S-box (the SWAPMOVE /
 * OR-XOR sequence in reverse order). The statement order is part of the
 * S-box definition — do not reorder.
 */
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
    inv_mixcolumns_3(state); /* undo round 4i+3 */ \
    state[0] ^= (rtk1)[12]; \
    state[1] ^= (rtk1)[13]; \
    state[2] ^= (rtk1)[14]; \
    state[3] ^= (rtk1)[15]; \
    state[0] ^= (rtk2_3)[12]; \
    state[1] ^= (rtk2_3)[13]; \
    state[2] ^= (rtk2_3)[14]; \
    state[3] ^= (rtk2_3)[15]; \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    inv_mixcolumns_2(state); /* undo round 4i+2 */ \
    state[0] ^= (rtk1)[8]; \
    state[1] ^= (rtk1)[9]; \
    state[2] ^= (rtk1)[10]; \
    state[3] ^= (rtk1)[11]; \
    state[0] ^= (rtk2_3)[8]; \
    state[1] ^= (rtk2_3)[9]; \
    state[2] ^= (rtk2_3)[10]; \
    state[3] ^= (rtk2_3)[11]; \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    inv_mixcolumns_1(state); /* undo round 4i+1 */ \
    state[0] ^= (rtk1)[4]; \
    state[1] ^= (rtk1)[5]; \
    state[2] ^= (rtk1)[6]; \
    state[3] ^= (rtk1)[7]; \
    state[0] ^= (rtk2_3)[4]; \
    state[1] ^= (rtk2_3)[5]; \
    state[2] ^= (rtk2_3)[6]; \
    state[3] ^= (rtk2_3)[7]; \
    SWAPMOVE(state[1], state[2], 0x55555555, 0); \
    state[3] ^= (state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    inv_mixcolumns_0(state); /* undo round 4i */ \
    state[0] ^= (rtk1)[0]; \
    state[1] ^= (rtk1)[1]; \
    state[2] ^= (rtk1)[2]; \
    state[3] ^= (rtk1)[3]; \
    state[0] ^= (rtk2_3)[0]; \
    state[1] ^= (rtk2_3)[1]; \
    state[2] ^= (rtk2_3)[2]; \
    state[3] ^= (rtk2_3)[3]; \
    SWAPMOVE(state[3], state[0], 0x55555555, 0); \
    state[1] ^= (state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
    SWAPMOVE(state[0], state[3], 0x55555555, 1); \
    SWAPMOVE(state[1], state[0], 0x55555555, 1); \
    state[1] ^= ~(state[2] | state[3]); \
    SWAPMOVE(state[3], state[2], 0x55555555, 1); \
    SWAPMOVE(state[2], state[1], 0x55555555, 1); \
    state[3] ^= ~(state[0] | state[1]); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16 // 128-bit authentication tag
#define KEYBYTES 16 // 128-bit key
#define BLOCKBYTES 16 // 128-bit cipher block
/* Write the domain-separation byte into the last byte of a 16-byte tweak. */
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
/* One step of the 64-bit Galois LFSR used as block counter: shift left and
 * XOR in 0x1B when the MSB was set.
 * NOTE(review): relies on a variable named 'feedback' declared at the call
 * site — every caller must declare it before using this macro. */
#define UPDATE_LFSR(lfsr) ({ \
    feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
    (lfsr) = ((lfsr) << 1) ^ feedback; \
})
/* Store the 64-bit word x little-endian into ptr[0..7]. */
#define LE_STR_64(ptr, x) ({ \
    (ptr)[0] = (u8)(x); \
    (ptr)[1] = (u8)((x) >> 8); \
    (ptr)[2] = (u8)((x) >> 16); \
    (ptr)[3] = (u8)((x) >> 24); \
    (ptr)[4] = (u8)((x) >> 32); \
    (ptr)[5] = (u8)((x) >> 40); \
    (ptr)[6] = (u8)((x) >> 48); \
    (ptr)[7] = (u8)((x) >> 56); \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
/******************************************************************************
* Implementation of the SKINNY tweakey schedule to match fixslicing.
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h> //for memcpy/memset
#include "tk_schedule.h"
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
/******************************************************************************
* The round constants according to the new representation.
******************************************************************************/
/* 224 = 4 * 56 words: one 4-word bitsliced round constant per round of
 * SKINNY-128-384 (see precompute_rtk2_3, which XORs rconst_32_bs[i*4+j]
 * into the round tweakey for i in [0, SKINNY128_384_ROUNDS)). */
u32 rconst_32_bs[224] = {
    0x00000004, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x10000100, 0xfffffeff, 0x44000000, 0xfbffffff, 0x00000000, 0x04000000,
    0x00100000, 0x00100000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
    0x00400000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffbfffff,
    0x01004000, 0xfefffbff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
    0x00010410, 0xfffffbef, 0x00000054, 0xffffffaf, 0x00000000, 0x00000040,
    0x00000100, 0x00000100, 0x10000140, 0xfffffeff, 0x44000000, 0xfffffeff,
    0x04000000, 0x04000000, 0x00100000, 0x00100000, 0x04000001, 0xfbffffff,
    0x00140000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x00000000,
    0x01401000, 0xfebfffff, 0x01004400, 0xfffffbff, 0x00000000, 0x00000400,
    0x00000010, 0x00000010, 0x00010010, 0xffffffff, 0x00000004, 0xffffffaf,
    0x00000040, 0x00000040, 0x00000100, 0x00000000, 0x10000140, 0xffffffbf,
    0x40000100, 0xfbfffeff, 0x00000000, 0x04000000, 0x00100000, 0x00000000,
    0x04100001, 0xffefffff, 0x00440000, 0xffefffff, 0x00000000, 0x00400000,
    0x01000000, 0x01000000, 0x00401000, 0xffffffff, 0x00004000, 0xfeffffff,
    0x00000400, 0x00000000, 0x00000000, 0x00000000, 0x00010400, 0xfffffbff,
    0x00000014, 0xffffffbf, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x10000100, 0xffffffff, 0x40000000, 0xfbffffff, 0x00000000, 0x04000000,
    0x00100000, 0x00000000, 0x00100001, 0xffefffff, 0x00440000, 0xffafffff,
    0x00000000, 0x00400000, 0x01000000, 0x01000000, 0x01401000, 0xffffffff,
    0x00004000, 0xfeffffff, 0x00000400, 0x00000400, 0x00000010, 0x00000000,
    0x00010400, 0xfffffbff, 0x00000014, 0xffffffaf, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x10000140, 0xfffffeff, 0x44000000, 0xffffffff,
    0x00000000, 0x04000000, 0x00100000, 0x00100000, 0x00000001, 0xffefffff,
    0x00440000, 0xffafffff, 0x00400000, 0x00000000, 0x00000000, 0x01000000,
    0x01401000, 0xffbfffff, 0x01004000, 0xfffffbff, 0x00000400, 0x00000400,
    0x00000010, 0x00000000, 0x00010010, 0xfffffbff, 0x00000014, 0xffffffef,
    0x00000000, 0x00000040, 0x00000100, 0x00000000, 0x10000040, 0xfffffeff,
    0x44000000, 0xfffffeff, 0x00000000, 0x00000000, 0x00000000, 0x00100000,
    0x04000001, 0xffffffff, 0x00040000, 0xffffffff, 0x00400000, 0x00000000,
    0x00000000, 0x00000000, 0x00001000, 0xfebfffff, 0x01004400, 0xffffffff,
    0x00000000, 0x00000000, 0x00000000, 0x00000010, 0x00010000, 0xffffffff,
    0x00000004, 0xffffffbf, 0x00000040, 0x00000000, 0x00000000, 0x00000000,
    0x10000100, 0xfffffebf, 0x44000100, 0xffffffff, 0x00000000, 0x04000000,
    0x00100000, 0x00100000, 0x00000001, 0xffffffff, 0x00040000, 0xffafffff,
    0x00400000, 0x00000000, 0x00000000, 0x00000000, 0x01401000, 0xffbfffff,
    0x01004000, 0xfffffbff, 0x00000000, 0x00000400, 0x00000010, 0x00000000,
    0x00010010, 0xffffffff
};
/******************************************************************************
* Pack the input into the bitsliced representation
* 24 28 56 60 88 92 120 124 | ... | 0 4 32 36 64 68 96 100
* 25 29 57 61 89 93 121 125 | ... | 1 5 33 37 65 69 97 101
* 26 30 58 62 90 94 122 126 | ... | 2 6 34 38 66 70 98 102
* 27 31 59 63 91 95 123 127 | ... | 3 7 35 39 67 71 99 103
******************************************************************************/
void packing(u32* out, const u8* in) {
    u32 tmp; // scratch word required by the SWAPMOVE macro
    /* Load the 16 input bytes as 4 little-endian words; note the
     * interleaved byte offsets (0, 8, 4, 12). */
    LE_LOAD(out, in);
    LE_LOAD(out + 1, in + 8);
    LE_LOAD(out + 2, in + 4);
    LE_LOAD(out + 3, in + 12);
    /* Bit-permutation network producing the bitsliced layout documented
     * in the banner above. The SWAPMOVE sequence is order-critical. */
    SWAPMOVE(out[0], out[0], 0x0a0a0a0a, 3);
    SWAPMOVE(out[1], out[1], 0x0a0a0a0a, 3);
    SWAPMOVE(out[2], out[2], 0x0a0a0a0a, 3);
    SWAPMOVE(out[3], out[3], 0x0a0a0a0a, 3);
    SWAPMOVE(out[2], out[0], 0x30303030, 2);
    SWAPMOVE(out[1], out[0], 0x0c0c0c0c, 4);
    SWAPMOVE(out[3], out[0], 0x03030303, 6);
    SWAPMOVE(out[1], out[2], 0x0c0c0c0c, 2);
    SWAPMOVE(out[3], out[2], 0x03030303, 4);
    SWAPMOVE(out[3], out[1], 0x03030303, 2);
}
/******************************************************************************
* Unpack the input to a byte-wise representation
******************************************************************************/
void unpacking(u8* out, u32 *in) {
    u32 tmp; // scratch word required by the SWAPMOVE macro
    /* Exact inverse of packing(): the same SWAPMOVE steps (each is an
     * involution) applied in reverse order, then the interleaved
     * little-endian byte stores (offsets 0, 8, 4, 12). */
    SWAPMOVE(in[3], in[1], 0x03030303, 2);
    SWAPMOVE(in[3], in[2], 0x03030303, 4);
    SWAPMOVE(in[1], in[2], 0x0c0c0c0c, 2);
    SWAPMOVE(in[3], in[0], 0x03030303, 6);
    SWAPMOVE(in[1], in[0], 0x0c0c0c0c, 4);
    SWAPMOVE(in[2], in[0], 0x30303030, 2);
    SWAPMOVE(in[0], in[0], 0x0a0a0a0a, 3);
    SWAPMOVE(in[1], in[1], 0x0a0a0a0a, 3);
    SWAPMOVE(in[2], in[2], 0x0a0a0a0a, 3);
    SWAPMOVE(in[3], in[3], 0x0a0a0a0a, 3);
    LE_STORE(out, in[0]);
    LE_STORE(out + 8, in[1]);
    LE_STORE(out + 4, in[2]);
    LE_STORE(out + 12, in[3]);
}
/******************************************************************************
* 0 4 1 5
* 1 5 ---> 2 6
* 2 6 3 7
* 3 7 4 0
******************************************************************************/
void lfsr2_bs(u32* tk) {
    /* Compute the feedback word, then swap each even/odd bit pair in it
     * (0xaaaaaaaa masks the odd bit positions). */
    u32 fb = tk[0] ^ (tk[2] & 0xaaaaaaaa);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    /* Rotate the four slice words up by one (see diagram above) and
     * append the feedback word. */
    memmove(tk, tk + 1, 3 * sizeof(u32));
    tk[3] = fb;
}
/******************************************************************************
* 0 4 7 3
* 1 5 ---> 0 4
* 2 6 1 5
* 3 7 2 6
******************************************************************************/
void lfsr3_bs(u32* tk) {
    /* Compute the feedback word, then swap each even/odd bit pair in it
     * (0xaaaaaaaa masks the odd bit positions). */
    u32 fb = tk[3] ^ ((tk[1] & 0xaaaaaaaa) >> 1);
    fb = ((fb & 0xaaaaaaaa) >> 1) | ((fb << 1) & 0xaaaaaaaa);
    /* Rotate the four slice words down by one (see diagram above) and
     * prepend the feedback word; memmove handles the overlap. */
    memmove(tk + 1, tk, 3 * sizeof(u32));
    tk[0] = fb;
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, twice
******************************************************************************/
void permute_tk_2(u32* tk) {
    /* For each slice word, gather the bit groups of P^2 with explicit
     * right-rotations ((x >> r) | (x << (32 - r))) and masked shifts. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 14) | (x << 18)) & 0xcc00cc00;
        y |= (x & 0x000000ff) << 16;
        y |= (x & 0xcc000000) >> 2;
        y |= (x & 0x0033cc00) >> 8;
        y |= (x & 0x00cc0000) >> 18;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 4 times
******************************************************************************/
void permute_tk_4(u32* tk) {
    /* Bit gathering for P^4; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 22) | (x << 10)) & 0xcc0000cc;
        y |= ((x >> 16) | (x << 16)) & 0x3300cc00;
        y |= ((x >> 24) | (x << 8)) & 0x00cc3300;
        y |= (x & 0x00cc00cc) >> 2;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 6 times
******************************************************************************/
void permute_tk_6(u32* tk) {
    /* Bit gathering for P^6; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 6) | (x << 26)) & 0xcccc0000;
        y |= ((x >> 24) | (x << 8)) & 0x330000cc;
        y |= ((x >> 10) | (x << 22)) & 0x3333;
        y |= (x & 0xcc) << 14;
        y |= (x & 0x3300) << 2;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 8 times
******************************************************************************/
void permute_tk_8(u32* tk) {
    /* Bit gathering for P^8; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 24) | (x << 8)) & 0xcc000033;
        y |= ((x >> 8) | (x << 24)) & 0x33cc0000;
        y |= ((x >> 26) | (x << 6)) & 0x00333300;
        y |= (x & 0x00333300) >> 6;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 10 times
******************************************************************************/
void permute_tk_10(u32* tk) {
    /* Bit gathering for P^10; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 8) | (x << 24)) & 0xcc330000;
        y |= ((x >> 26) | (x << 6)) & 0x33000033;
        y |= ((x >> 22) | (x << 10)) & 0x00cccc00;
        y |= (x & 0x00330000) >> 14;
        y |= (x & 0xcc00) >> 2;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 12 times
******************************************************************************/
void permute_tk_12(u32* tk) {
    /* Bit gathering for P^12; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 8) | (x << 24)) & 0xcc33;
        y |= ((x >> 30) | (x << 2)) & 0x00cc00cc;
        y |= ((x >> 10) | (x << 22)) & 0x33330000;
        y |= ((x >> 16) | (x << 16)) & 0xcc003300;
        tk[i] = y;
    }
}
/******************************************************************************
* Apply the permutation in a bitsliced manner, 14 times
******************************************************************************/
void permute_tk_14(u32* tk) {
    /* Bit gathering for P^14; rotations written out explicitly. */
    for (int i = 0; i < 4; i++) {
        const u32 x = tk[i];
        u32 y = ((x >> 24) | (x << 8)) & 0x0033cc00;
        y |= ((x >> 14) | (x << 18)) & 0x00cc0000;
        y |= ((x >> 30) | (x << 2)) & 0xcc000000;
        y |= ((x >> 16) | (x << 16)) & 0x000000ff;
        y |= ((x >> 18) | (x << 14)) & 0x33003300;
        tk[i] = y;
    }
}
/******************************************************************************
* Precompute all LFSRs on TK2
******************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds) {
    u32 state[4];
    /* Pack TK2 into the bitsliced representation and store the initial
     * value in slot 0. */
    packing(state, key);
    memcpy(tk, state, 16);
    /* One LFSR2 step per pair of rounds; note the destination slots are
     * 4*r+4 with r stepping by 2, so only every other 4-word slot is
     * written — the in-between slots keep their previous contents. */
    for (int r = 0; r < rounds; r += 2) {
        lfsr2_bs(state);
        memcpy(&tk[4*r + 4], state, 16);
    }
}
/******************************************************************************
* Precompute all LFSRs on TK3
******************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
    u32 state[4];
    /* Pack TK3 and XOR it on top of the TK2 values already in tk. */
    packing(state, key);
    for (int j = 0; j < 4; j++)
        tk[j] ^= state[j];
    /* One LFSR3 step per pair of rounds; same every-other-slot layout
     * (offsets 4*r+4, r stepping by 2) as precompute_lfsr_tk2. */
    for (int r = 0; r < rounds; r += 2) {
        lfsr3_bs(state);
        for (int j = 0; j < 4; j++)
            tk[4*r + 4 + j] ^= state[j];
    }
}
/******************************************************************************
* XOR TK with TK1 before applying the permutations.
* The key is then rearranged to match the barrel shiftrows representation.
******************************************************************************/
/* XOR the packed TK1 (from 'key') into the precomputed tweakey material in
 * 'tk' and rearrange everything into the barrel-shiftrows representation.
 * Processes 8 rounds per loop iteration; the power of the tweakey
 * permutation P alternates between P^2..P^8 (first half of each 16-round
 * period) and P^10..P^14/identity (second half). Index offsets and masks
 * are layout-critical — do not touch without the fixslicing paper at hand. */
void permute_tk(u32* tk, const u8* key, const int rounds) {
    u32 test; // 1 while in the first half of a 16-round period
    u32 tk1[4], tmp[4];
    packing(tk1, key);
    memcpy(tmp, tk, 16);
    tmp[0] ^= tk1[0];
    tmp[1] ^= tk1[1];
    tmp[2] ^= tk1[2];
    tmp[3] ^= tk1[3];
    for(int i = 0 ; i < rounds; i += 8) {
        test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
        /* Round i: high nibbles only, rows rotated (2,3,0,1). */
        tk[i*4] = tmp[2] & 0xf0f0f0f0;
        tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
        tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
        tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
        memcpy(tmp, tk+i*4+4, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_2(tmp); // applies P^2
        else
            permute_tk_10(tmp); // applies P^10
        /* Rounds i+1 and i+2 share this permuted value. */
        tk[i*4+4] = ROR(tmp[0],26) & 0xc3c3c3c3;
        tk[i*4+5] = ROR(tmp[1],26) & 0xc3c3c3c3;
        tk[i*4+6] = ROR(tmp[2],26) & 0xc3c3c3c3;
        tk[i*4+7] = ROR(tmp[3],26) & 0xc3c3c3c3;
        tk[i*4+8] = ROR(tmp[2],28) & 0x03030303;
        tk[i*4+8] |= ROR(tmp[2],12) & 0x0c0c0c0c;
        tk[i*4+9] = ROR(tmp[3],28) & 0x03030303;
        tk[i*4+9] |= ROR(tmp[3],12) & 0x0c0c0c0c;
        tk[i*4+10] = ROR(tmp[0],28) & 0x03030303;
        tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
        tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
        tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
        memcpy(tmp, tk+i*4+12, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_4(tmp); // applies P^4
        else
            permute_tk_12(tmp); // applies P^12
        /* Rounds i+3 and i+4. */
        for(int j = 0; j < 4; j++) {
            tk[i*4+12+j] = ROR(tmp[j],14) & 0x30303030;
            tk[i*4+12+j] |= ROR(tmp[j],6) & 0x0c0c0c0c;
        }
        tk[i*4+16] = ROR(tmp[2], 16) & 0xf0f0f0f0;
        tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
        tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
        tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
        memcpy(tmp, tk+i*4+20, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_6(tmp); // applies P^6
        else
            permute_tk_14(tmp); // applies P^14
        /* Rounds i+5 and i+6. */
        tk[i*4+20] = ROR(tmp[0], 10) & 0xc3c3c3c3;
        tk[i*4+21] = ROR(tmp[1], 10) & 0xc3c3c3c3;
        tk[i*4+22] = ROR(tmp[2], 10) & 0xc3c3c3c3;
        tk[i*4+23] = ROR(tmp[3], 10) & 0xc3c3c3c3;
        tk[i*4+24] = ROR(tmp[2],12) & 0x03030303;
        tk[i*4+24] |= ROR(tmp[2],28) & 0x0c0c0c0c;
        tk[i*4+25] = ROR(tmp[3],12) & 0x03030303;
        tk[i*4+25] |= ROR(tmp[3],28) & 0x0c0c0c0c;
        tk[i*4+26] = ROR(tmp[0],12) & 0x03030303;
        tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
        tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
        tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
        memcpy(tmp, tk+i*4+28, 16);
        XOR_BLOCKS(tmp, tk1);
        if (test)
            permute_tk_8(tmp); // applies P^8
        /* Round i+7 (and the start of the next 8-round group). */
        for(int j = 0; j < 4; j++) {
            tk[i*4+28+j] = ROR(tmp[j],30) & 0x30303030;
            tk[i*4+28+j] |= ROR(tmp[j],22) & 0x0c0c0c0c;
        }
        if (test && (i+8 < rounds)) { //only if next loop iteration
            tk[i*4+32] = tmp[2] & 0xf0f0f0f0;
            tk[i*4+33] = tmp[3] & 0xf0f0f0f0;
            tk[i*4+34] = tmp[0] & 0xf0f0f0f0;
            tk[i*4+35] = tmp[1] & 0xf0f0f0f0;
        }
    }
}
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst.
******************************************************************************/
/* Build the full per-round tweakey material LFSR2(TK2) ^ LFSR3(TK3),
 * rearranged into the barrel-shiftrows layout, with the round constants
 * folded in. */
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
    memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
    /* permute_tk() expects a 16-byte TK1 as its 2nd argument; here TK1
     * must be zero. The LFSR precompute above only fills every other
     * 4-word slot (offsets 4*i+4 with i stepping by 2), so rtk+8 — an
     * unwritten slot — still holds the 16 zero bytes from the memset,
     * which is what the original "rtk+8 is NULL" comment means. */
    permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL
    for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts
        for(int j = 0; j < 4; j++)
            rtk[i*4+j] ^= rconst_32_bs[i*4+j];
    }
}
/******************************************************************************
* Precompute RTK1.
******************************************************************************/
/* Build the 16 per-round TK1 words (TK1 repeats with period 16 since its
 * schedule applies no LFSR, only the permutation P). */
void precompute_rtk1(u32* rtk1, const u8* tk1) {
    memset(rtk1, 0x00, 16*16);
    permute_tk(rtk1, tk1, 16);
}
\ No newline at end of file
#ifndef TK_SCHEDULE_H_
#define TK_SCHEDULE_H_
typedef unsigned char u8;
typedef unsigned int u32;
void packing(u32* out, const u8* in);
void unpacking(u8* out, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
void precompute_rtk1(u32* rtk1, const u8* tk1);
/* Rotate the 32-bit word x right by y bits (requires 0 < y < 32). */
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
/* x ^= y on 128-bit blocks represented as 4-word u32 arrays. */
#define XOR_BLOCKS(x,y) ({ \
    (x)[0] ^= (y)[0]; \
    (x)[1] ^= (y)[1]; \
    (x)[2] ^= (y)[2]; \
    (x)[3] ^= (y)[3]; \
})
/* Swap the bit groups selected by mask between b and (a >> n); the classic
 * bitslicing primitive. Requires a u32 variable named 'tmp' in scope at the
 * call site. Arguments are fully parenthesized so expression arguments
 * (e.g. shifted masks) cannot change precedence. */
#define SWAPMOVE(a, b, mask, n) ({ \
    tmp = ((b) ^ ((a) >> (n))) & (mask); \
    (b) ^= tmp; \
    (a) ^= (tmp << (n)); \
})
/* Little-endian load of 4 bytes at y into the u32 pointed to by x.
 * Wrapped in do/while(0) so it behaves as a single statement. */
#define LE_LOAD(x, y) do { \
    *(x) = (((u32)(y)[3] << 24) | \
            ((u32)(y)[2] << 16) | \
            ((u32)(y)[1] << 8) | \
            (y)[0]); \
} while (0)
/* Little-endian store of the u32 y into the 4 bytes at x. Wrapped in
 * do/while(0): the previous bare multi-statement form silently broke
 * inside an unbraced if/else. */
#define LE_STORE(x, y) do { \
    (x)[0] = (y) & 0xff; \
    (x)[1] = ((y) >> 8) & 0xff; \
    (x)[2] = ((y) >> 16) & 0xff; \
    (x)[3] = (y) >> 24; \
} while (0)
#endif // TK_SCHEDULE_H_
\ No newline at end of file
/* Parameter sizes (in bytes) advertised to the NIST LWC test harness. */
#define CRYPTO_KEYBYTES 16   // 128-bit key
#define CRYPTO_NSECBYTES 0   // no secret message number
#define CRYPTO_NPUBBYTES 16  // 128-bit public nonce
#define CRYPTO_ABYTES 16     // 128-bit authentication tag
#define CRYPTO_NOOVERLAP 1   // in/out buffers must not partially overlap
/* AEAD encryption (NIST LWC API): writes ciphertext || tag into c and
 * sets *clen = mlen + CRYPTO_ABYTES. Returns 0 on success. */
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k
);
/* AEAD decryption + verification (NIST LWC API): on success writes the
 * plaintext into m, sets *mlen = clen - CRYPTO_ABYTES and returns 0;
 * returns non-zero when the tag does not verify. */
int crypto_aead_decrypt(
    unsigned char *m, unsigned long long *mlen,
    unsigned char *nsec,
    const unsigned char *c, unsigned long long clen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *npub,
    const unsigned char *k
);
\ No newline at end of file
/******************************************************************************
* Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
*
* Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
*
 * For more details, see the author's fixslicing paper (IACR ePrint archive).
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include "skinny128.h"
#include "skinnyaead.h"
#include <string.h>
#include <stdio.h>
/******************************************************************************
* x ^= y where x, y are 128-bit blocks (16 bytes array).
******************************************************************************/
static void xor_block(u8 * x, const u8* y) {
    /* XOR the 16-byte block y into x, walking both pointers. */
    const u8 *stop = x + BLOCKBYTES;
    while (x != stop)
        *x++ ^= *y++;
}
/******************************************************************************
* Process the associated data. Common to SKINNY-AEAD-M1 encrypt and decrypt
* functions.
******************************************************************************/
/* Absorb the associated data into 'auth': each AD block is encrypted under
 * a tweak holding the 64-bit LFSR counter plus a domain byte (0x02 full AD
 * block, 0x03 padded AD block), and the results are XORed together.
 * When mlen == 0 the message side produced no tag yet, so the final AD
 * block is processed in parallel with the tag computation (using 'tag' as
 * the second tweak and writing through 'c'); otherwise 'c' and 'tag' are
 * not touched here.
 * NOTE(review): this translation unit calls 3-argument precompute_rtk1 and
 * 5-argument skinny128_384_encrypt (two-blocks-in-parallel variants); the
 * tk_schedule.h appearing elsewhere in this dump declares a 2-argument
 * precompute_rtk1 — confirm the matching headers are paired at build time. */
static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
                    u64 mlen, const u8* ad, u64 adlen) {
    u64 lfsr = 1;
    u8 feedback; // required by UPDATE_LFSR
    u8 tmp[2*BLOCKBYTES];
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    SET_DOMAIN(tmp, 0x02);
    while (adlen >= 2*BLOCKBYTES) { // process 2 full AD blocks in parallel
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
        precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES);
        skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
        adlen -= 2*BLOCKBYTES;
        ad += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    if (adlen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr);
        SET_DOMAIN(tmp + BLOCKBYTES, 0x03); // domain for padding ad
        precompute_rtk1(tk->rtk1, tmp, tmp + BLOCKBYTES);
        adlen -= BLOCKBYTES;
        memset(tmp, 0x00, BLOCKBYTES);
        memcpy(tmp, ad + BLOCKBYTES, adlen);
        tmp[adlen] ^= 0x80; // 10* padding
        skinny128_384_encrypt(tmp + BLOCKBYTES, tmp, ad, tmp, *tk);
        xor_block(auth, tmp);
        xor_block(auth, tmp + BLOCKBYTES);
    } else if (adlen == BLOCKBYTES) { // last AD block is full
        LE_STR_64(tmp, lfsr);
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
            skinny128_384_encrypt(auth, c, ad, c, *tk);
        } else { // if tag has been calculated already
            precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
            skinny128_384_encrypt(auth, auth, ad, ad, *tk);
        }
    } else if (adlen > 0) { // last AD block is partial
        LE_STR_64(tmp, lfsr);
        SET_DOMAIN(tmp, 0x03); // domain for padding ad
        memset(tmp + BLOCKBYTES, 0x00, BLOCKBYTES); // padding
        memcpy(tmp + BLOCKBYTES, ad, adlen); // padding
        tmp[BLOCKBYTES + adlen] ^= 0x80; // 10* padding
        if (mlen == 0) { // if tag has *NOT* been calculated yet
            precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag
            skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
        } else { // if tag has been calculated already
            precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block
            skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
        }
    }
}
/******************************************************************************
* Encryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
/* SKINNY-AEAD-M1 encryption (NIST LWC API). Writes ciphertext followed by
 * the 16-byte tag into c and sets *clen = mlen + TAGBYTES. The tag is the
 * XOR-sum of all plaintext blocks (10*-padded at the end), encrypted under
 * a dedicated domain (0x04 full last block / 0x05 padded last block), then
 * XORed with the AD authentication value. nsec is unused. Returns 0. */
int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
                    const unsigned char *m, unsigned long long mlen,
                    const unsigned char *ad, unsigned long long adlen,
                    const unsigned char *nsec,
                    const unsigned char *npub,
                    const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter (Galois LFSR)
    u8 feedback;    // required by UPDATE_LFSR
    tweakey tk;
    u8 tmp[2*BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    // ----------------- Initialization -----------------
    *clen = mlen + TAGBYTES;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(c + mlen, 0x00, BLOCKBYTES); // tag accumulator lives at c + mlen
    // ----------------- Initialization -----------------
    // ----------------- Process the plaintext -----------------
    while (mlen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_encrypt(c, c + BLOCKBYTES, m, m + BLOCKBYTES, tk);
        xor_block(c + mlen, m); // sum for tag computation
        xor_block(c + mlen, m + BLOCKBYTES); // sum for tag computation
        mlen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (mlen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for 2nd block
        SET_DOMAIN(tmp + BLOCKBYTES, 0x01); // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // 2nd call encrypts 'auth' as a keystream block for the partial m
        skinny128_384_encrypt(c, auth, m, auth, tk);
        xor_block(c + mlen, m);
        for(i = 0; i < mlen - BLOCKBYTES; i++) {
            c[BLOCKBYTES + i] = auth[i] ^ m[BLOCKBYTES + i];
            c[mlen + i] ^= m[BLOCKBYTES + i];
        }
        c[mlen + i] ^= 0x80; // 10* padding of the tag sum
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += mlen;
        c += mlen;
        mlen = 0;
        UPDATE_LFSR(lfsr);
    } else if (mlen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr); // lfsr for last full block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x04); // domain for tag computation
        xor_block(c + mlen, m); // sum for tag computation
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // encrypts the last block and the tag sum in parallel
        skinny128_384_encrypt(c, c + mlen, m, c + mlen, tk);
        c += BLOCKBYTES;
    } else if (mlen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for tag computation
        SET_DOMAIN(tmp + BLOCKBYTES, 0x05); // domain for tag computation
        for(i = 0; i < mlen; i++) // sum for tag computation
            c[mlen + i] ^= m[i]; // sum for tag computation
        c[mlen + i] ^= 0x80; // 10* padding of the tag sum
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        // keystream for the partial block + tag encryption in parallel
        skinny128_384_encrypt(auth, c + mlen, auth, c + mlen, tk);
        for(i = 0; i < mlen; i++)
            c[i] = auth[i] ^ m[i]; // encrypted padded block
        c += mlen;
    }
    if (mlen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) { //if all AD can be processed in //
            precompute_rtk1(tk.rtk1, tag, tag);
            skinny128_384_encrypt(c, c, c, c, tk); // compute the tag
        }
        // otherwise skinny_aead_m1_auth pairs the tag with the last AD block
    }
    // ----------------- Process the plaintext -----------------
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, c, tag, &tk, mlen, ad, adlen);
    xor_block(c, auth); // final tag = E(tag sum) ^ auth
    // ----------------- Process the associated data -----------------
    return 0;
}
/******************************************************************************
* Decryption and authentication using SKINNY-AEAD-M1
******************************************************************************/
/* SKINNY-AEAD-M1 decryption + tag verification (NIST LWC API).
 * c holds ciphertext || 16-byte tag; on success the plaintext is written
 * to m and *mlen = clen - TAGBYTES. Returns 0 when the tag verifies and
 * -1 otherwise. Fix: the previous version returned the raw byte-OR of the
 * tag difference (any value in 1..255 on failure), so callers comparing
 * the result against -1 would treat a forgery as success; the comparison
 * is still accumulated branch-free over all TAGBYTES bytes.
 * NOTE(review): the plaintext is written into m before the tag check, so
 * callers must discard m whenever the return value is non-zero. */
int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
                    unsigned char *nsec,
                    const unsigned char *c, unsigned long long clen,
                    const unsigned char *ad, unsigned long long adlen,
                    const unsigned char *npub,
                    const unsigned char *k) {
    u64 i,lfsr = 1; // 64-bit block counter (Galois LFSR)
    u8 feedback;    // required by UPDATE_LFSR; reused as tag accumulator
    tweakey tk;
    u8 tmp[2*BLOCKBYTES];
    u8 sum[BLOCKBYTES], tag[BLOCKBYTES], auth[BLOCKBYTES];
    (void)nsec;
    if (clen < TAGBYTES) // ciphertext must at least contain the tag
        return -1;
    // ----------------- Initialization -----------------
    clen -= TAGBYTES;
    *mlen = clen;
    precompute_rtk2_3(tk.rtk2_3, npub, k, SKINNY128_384_ROUNDS);
    memset(tmp, 0x00, 2*BLOCKBYTES);
    memset(tag, 0x00, BLOCKBYTES);
    memset(auth, 0x00, BLOCKBYTES);
    memset(sum, 0x00, BLOCKBYTES); // XOR-sum of plaintext blocks
    // ----------------- Initialization -----------------
    // ----------------- Process the ciphertext -----------------
    while (clen >= 2*BLOCKBYTES) { // process 2 blocks in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp + BLOCKBYTES, lfsr); // lfsr for 2nd block
        precompute_rtk1(tk.rtk1, tmp, tmp + BLOCKBYTES);
        skinny128_384_decrypt(m, m + BLOCKBYTES, c, c + BLOCKBYTES, tk);
        xor_block(sum, m); // sum for tag computation
        xor_block(sum, m + BLOCKBYTES); // sum for tag computation
        clen -= 2*BLOCKBYTES;
        c += 2*BLOCKBYTES;
        m += 2*BLOCKBYTES;
        UPDATE_LFSR(lfsr);
    }
    SET_DOMAIN(tag, 0x04); // domain for tag computation
    if (clen > BLOCKBYTES) { // pad and process 2 blocs in //
        LE_STR_64(tmp, lfsr); // lfsr for 1st block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_decrypt(m, m, c, c, tk);
        xor_block(sum, m);
        UPDATE_LFSR(lfsr);
        LE_STR_64(tmp, lfsr); // lfsr for 2nd block
        SET_DOMAIN(tmp, 0x01); // domain for padding m
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // keystream block to undo the partial-block encryption
        skinny128_384_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen - BLOCKBYTES; i++) {
            m[BLOCKBYTES + i] = auth[i] ^ c[BLOCKBYTES + i];
            sum[i] ^= m[BLOCKBYTES + i];
        }
        sum[i] ^= 0x80; // 10* padding of the tag sum
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        m += clen;
        c += clen;
        clen = 0;
        UPDATE_LFSR(lfsr);
    } else if (clen == BLOCKBYTES) { // last block is full
        LE_STR_64(tmp, lfsr); // lfsr for last full block
        precompute_rtk1(tk.rtk1, tmp, tmp);
        skinny128_384_decrypt(m, m, c, c, tk);
        xor_block(sum, m); // sum for tag computation
        SET_DOMAIN(tag, 0x04); // domain for tag computation
        UPDATE_LFSR(lfsr);
        c += BLOCKBYTES;
        clen = 0;
    } else if (clen > 0) { // last block is partial
        LE_STR_64(tmp, lfsr); // lfsr for last block
        SET_DOMAIN(tmp, 0x01); // domain for padding
        precompute_rtk1(tk.rtk1, tmp, tmp);
        // keystream block to undo the partial-block encryption
        skinny128_384_encrypt(auth, auth, auth, auth, tk);
        for(i = 0; i < clen; i++) {
            m[i] = auth[i] ^ c[i]; // decrypted padded block
            sum[i] ^= m[i]; // sum for tag computation
        }
        sum[i] ^= 0x80; // 10* padding of the tag sum
        SET_DOMAIN(tag, 0x05); // domain for tag computation
        UPDATE_LFSR(lfsr);
        m += clen;
        c += clen;
        clen = 0;
    }
    if (clen == 0) { // if tag has *NOT* been calculated yet
        LE_STR_64(tag, lfsr); // lfsr for tag computation
        if((adlen % 32) == 0 || (adlen % 32) > BLOCKBYTES) {
            precompute_rtk1(tk.rtk1, tag, tag); //if AD can be processed in //
            skinny128_384_encrypt(sum, sum, sum, sum, tk); // compute the tag
        }
    }
    // ----------------- Process the associated data -----------------
    skinny_aead_m1_auth(auth, sum, tag, &tk, clen, ad, adlen);
    xor_block(sum, auth); // recomputed tag
    feedback = 0;
    for(i = 0; i < TAGBYTES; i++)
        feedback |= sum[i] ^ c[i]; // constant-time tag comparison
    return feedback ? -1 : 0; // canonical NIST LWC return codes
    // ----------------- Process the associated data -----------------
}
\ No newline at end of file
/******************************************************************************
* Fixsliced implementation of SKINNY-128-384.
* Two blocks are processed in parallel.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
 * are synchronized with the classical representation after only 4 rounds.
 * However, the Sbox
* permutation requires 8 rounds for a synchronization. To limit the impact
* on code size, we compute the permutation every 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
 * For more details, see the author's fixslicing paper (IACR ePrint archive).
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0.
****************************************************************************/
void mixcolumns_0(u32* state) {
    /* Three masked-rotate XOR steps per slice word; rotations are written
     * out explicitly as (x >> r) | (x << (32 - r)). */
    for (int i = 0; i < 8; i++) {
        u32 s = state[i];
        u32 t = ((s >> 24) | (s << 8)) & 0x0c0c0c0c;
        s ^= (t >> 30) | (t << 2);
        t = ((s >> 16) | (s << 16)) & 0xc0c0c0c0;
        s ^= (t >> 4) | (t << 28);
        t = ((s >> 8) | (s << 24)) & 0x0c0c0c0c;
        s ^= (t >> 2) | (t << 30);
        state[i] = s;
    }
}
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 1.
****************************************************************************/
void mixcolumns_1(u32* state) {
    /* Variant of the masked-rotate MixColumns for round offset 1. */
    for (int i = 0; i < 8; i++) {
        u32 s = state[i];
        u32 t = ((s >> 16) | (s << 16)) & 0x30303030;
        s ^= (t >> 30) | (t << 2);
        t = s & 0x03030303;
        s ^= (t >> 28) | (t << 4);
        t = ((s >> 16) | (s << 16)) & 0x30303030;
        s ^= (t >> 2) | (t << 30);
        state[i] = s;
    }
}
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 2.
****************************************************************************/
void mixcolumns_2(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,8) & 0xc0c0c0c0;
		w ^= ROR(t,6);
		t = ROR(w,16) & 0x0c0c0c0c;
		w ^= ROR(t,28);
		t = ROR(w,24) & 0xc0c0c0c0;
		w ^= ROR(t,2);
		state[i] = w;
	}
}
/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 3.
****************************************************************************/
void mixcolumns_3(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = w & 0x03030303;
		w ^= ROR(t,30);
		t = w & 0x30303030;
		w ^= ROR(t,4);
		t = w & 0x03030303;
		w ^= ROR(t,26);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 0.
****************************************************************************/
void inv_mixcolumns_0(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,8) & 0x0c0c0c0c;
		w ^= ROR(t,2);
		t = ROR(w,16) & 0xc0c0c0c0;
		w ^= ROR(t,4);
		t = ROR(w,24) & 0x0c0c0c0c;
		w ^= ROR(t,30);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 1.
****************************************************************************/
void inv_mixcolumns_1(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,16) & 0x30303030;
		w ^= ROR(t,2);
		t = w & 0x03030303;
		w ^= ROR(t,28);
		t = ROR(w,16) & 0x30303030;
		w ^= ROR(t,30);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 2.
****************************************************************************/
void inv_mixcolumns_2(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = ROR(w,24) & 0xc0c0c0c0;
		w ^= ROR(t,2);
		t = ROR(w,16) & 0x0c0c0c0c;
		w ^= ROR(t,28);
		t = ROR(w,8) & 0xc0c0c0c0;
		w ^= ROR(t,6);
		state[i] = w;
	}
}
/****************************************************************************
* The inverse MixColumns operation for rounds i such that (i % 4) == 3.
****************************************************************************/
void inv_mixcolumns_3(u32* state) {
	u32 t;
	for(int i = 0; i < 8; i++) {
		u32 w = state[i];
		t = w & 0x03030303;
		w ^= ROR(t,26);
		t = w & 0x30303030;
		w ^= ROR(t,4);
		t = w & 0x03030303;
		w ^= ROR(t,30);
		state[i] = w;
	}
}
/****************************************************************************
* Adds the round tweakey to the state. The round constants are already
* folded into 'rtk2_3' by the tweakey schedule, so a single pass of XORs
* over the 8 bitsliced state words covers key material and constants.
****************************************************************************/
void add_tweakey(u32* state, const u32* rtk1, const u32* rtk2_3) {
	for(int i = 0; i < 8; i++)
		state[i] ^= rtk1[i] ^ rtk2_3[i];
}
/****************************************************************************
* Encryption of 2 blocks in parallel using SKINNY-128-384 (56 rounds).
* The round tweakeys 'rtk1' and 'rtk2_3' are kept separate to avoid
* unnecessary recomputations of the entire tk schedule during
* SKINNY-AEAD-M1.
****************************************************************************/
void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
			const u8* ptext_bis, const tweakey tk) {
	u32 state[8];
	packing(state, ptext, ptext_bis);
	// 56 rounds = 14 quadruple rounds. rtk2_3 advances linearly (8 words
	// per round); rtk1 is only expanded for 16 rounds, so its offset
	// cycles through 0, 32, 64, 96.
	for(int i = 0; i < 14; i++)
		QUADRUPLE_ROUND(state, tk.rtk1 + 32*(i & 3), tk.rtk2_3 + 32*i);
	unpacking(ctext, ctext_bis, state);
}
/****************************************************************************
* Decryption of 2 blocks in parallel using SKINNY-128-384 (56 rounds).
* The round tweakeys 'rtk1' and 'rtk2_3' are kept separate to avoid
* unnecessary recomputations of the entire tk schedule during
* SKINNY-AEAD-M1.
****************************************************************************/
void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
			const u8* ctext_bis, const tweakey tk) {
	u32 state[8];
	packing(state, ctext, ctext_bis);
	// Exact mirror of the encryption loop: the 14 quadruple rounds are
	// undone from the last one (i = 13) back to the first (i = 0).
	for(int i = 13; i >= 0; i--)
		INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32*(i & 3), tk.rtk2_3 + 32*i);
	unpacking(ptext, ptext_bis, state);
}
\ No newline at end of file
#ifndef SKINNY128_H_
#define SKINNY128_H_
#include "tk_schedule.h"
//Encrypts 2 blocks in parallel: (ptext, ptext_bis) -> (ctext, ctext_bis).
void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
			const u8* ptext_bis, const tweakey tk);
//Decrypts 2 blocks in parallel: (ctext, ctext_bis) -> (ptext, ptext_bis).
//Parameter names now match the definition: the first two (output)
//parameters are the recovered plaintexts, the ciphertexts are inputs.
void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
			const u8* ctext_bis, const tweakey tk);
#define SKINNY128_128_ROUNDS 40
#define SKINNY128_256_ROUNDS 48
#define SKINNY128_384_ROUNDS 56
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
/*
 * Four consecutive SKINNY-128 rounds on the bitsliced state.
 * Each round consists of: the S-box layer as a fixed sequence of
 * AND/OR/XOR sheet operations, AddRoundTweakey (round constants are
 * already folded into rtk2_3 by the tweakey schedule), and the fixsliced
 * MixColumns variant matching the round index mod 4. The statement order
 * within each S-box group is significant and must not be changed.
 * The 12 trailing XOR-swaps exchange word pairs (0,1), (2,3), (4,7) and
 * (5,6); per the file header, this realigns the bitsliced S-box
 * representation with the classical one every 4 rounds.
 */
#define QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] | state[5]); \
	state[1] ^= (state[6] | state[5]); \
	state[2] ^= (state[3] & state[7]); \
	state[6] ^= (~state[7] | state[4]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[4] ^= (~state[3] | state[2]); \
	state[5] ^= (state[6] & state[0]); \
	add_tweakey(state, rtk1, rtk2_3); \
	mixcolumns_0(state); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] | state[1]); \
	state[3] ^= (state[0] | state[1]); \
	state[7] ^= (state[4] & state[5]); \
	state[0] ^= (~state[5] | state[6]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[6] ^= (~state[4] | state[7]); \
	state[1] ^= (state[0] & state[2]); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	mixcolumns_1(state); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] | state[3]); \
	state[4] ^= (state[2] | state[3]); \
	state[5] ^= (state[6] & state[1]); \
	state[2] ^= (~state[1] | state[0]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[0] ^= (~state[6] | state[5]); \
	state[3] ^= (state[2] & state[7]); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	mixcolumns_2(state); \
	state[0] ^= (state[5] | state[6]); \
	state[3] ^= (state[2] | state[4]); \
	state[6] ^= (state[7] | state[4]); \
	state[1] ^= (state[0] & state[3]); \
	state[7] ^= (~state[3] | state[2]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[2] ^= (~state[0] | state[1]); \
	state[4] ^= (state[7] & state[5]); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	mixcolumns_3(state); \
	state[0] ^= state[1]; \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
})
/*
 * Inverse of QUADRUPLE_ROUND: the word-pair swaps are undone first, then
 * each of the 4 rounds is inverted as inverse MixColumns, AddRoundTweakey
 * (XOR is self-inverse) and the inverse S-box layer, with every statement
 * in exactly the reverse order of the forward macro.
 */
#define INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3) ({ \
	state[0] ^= state[1]; \
	state[1] ^= state[0]; \
	state[0] ^= state[1]; \
	state[2] ^= state[3]; \
	state[3] ^= state[2]; \
	state[2] ^= state[3]; \
	state[4] ^= state[7]; \
	state[7] ^= state[4]; \
	state[4] ^= state[7]; \
	state[5] ^= state[6]; \
	state[6] ^= state[5]; \
	state[5] ^= state[6]; \
	inv_mixcolumns_3(state); \
	add_tweakey(state, rtk1+24, rtk2_3+24); \
	state[4] ^= (state[7] & state[5]); \
	state[2] ^= (~state[0] | state[1]); \
	state[5] ^= (state[1] | ~state[6]); \
	state[7] ^= (~state[3] | state[2]); \
	state[1] ^= (state[0] & state[3]); \
	state[6] ^= (state[7] | state[4]); \
	state[3] ^= (state[2] | state[4]); \
	state[0] ^= (state[5] | state[6]); \
	inv_mixcolumns_2(state); \
	add_tweakey(state, rtk1+16, rtk2_3+16); \
	state[3] ^= (state[2] & state[7]); \
	state[0] ^= (~state[6] | state[5]); \
	state[7] ^= (state[5] | ~state[4]); \
	state[2] ^= (~state[1] | state[0]); \
	state[5] ^= (state[6] & state[1]); \
	state[4] ^= (state[2] | state[3]); \
	state[1] ^= (state[0] | state[3]); \
	state[6] ^= (state[7] | state[4]); \
	inv_mixcolumns_1(state); \
	add_tweakey(state, rtk1+8, rtk2_3+8); \
	state[1] ^= (state[0] & state[2]); \
	state[6] ^= (~state[4] | state[7]); \
	state[2] ^= (state[7] | ~state[3]); \
	state[0] ^= (~state[5] | state[6]); \
	state[7] ^= (state[4] & state[5]); \
	state[3] ^= (state[0] | state[1]); \
	state[5] ^= (state[6] | state[1]); \
	state[4] ^= (state[2] | state[3]); \
	inv_mixcolumns_0(state); \
	add_tweakey(state, rtk1, rtk2_3); \
	state[5] ^= (state[6] & state[0]); \
	state[4] ^= (~state[3] | state[2]); \
	state[0] ^= (state[2] | ~state[1]); \
	state[6] ^= (~state[7] | state[4]); \
	state[2] ^= (state[3] & state[7]); \
	state[1] ^= (state[6] | state[5]); \
	state[7] ^= (state[4] | state[5]); \
	state[3] ^= (state[0] | state[1]); \
})
#endif // SKINNY128_H_
\ No newline at end of file
#ifndef SKINNYAEADM1_H_
#define SKINNYAEADM1_H_
#include "skinny128.h"
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#define TAGBYTES 16
#define KEYBYTES 16
#define BLOCKBYTES 16
// Writes the domain-separation value into the last byte of block 'ptr'.
#define SET_DOMAIN(ptr, domain) ((ptr)[15] = (domain))
// Advances the 64-bit LFSR block counter by one step (feedback constant
// 0x1B).
// NOTE(review): relies on a variable 'feedback' declared in the calling
// scope -- callers must provide it.
#define UPDATE_LFSR(lfsr) ({ \
	feedback = ((lfsr) & (1ULL << 63)) ? 0x1B : 0x00; \
	(lfsr) = ((lfsr) << 1) ^ feedback; \
})
// Serializes the 64-bit value 'x' into 8 bytes at 'ptr', little-endian.
#define LE_STR_64(ptr, x) ({ \
	(ptr)[0] = (u8)(x); \
	(ptr)[1] = (u8)((x) >> 8); \
	(ptr)[2] = (u8)((x) >> 16); \
	(ptr)[3] = (u8)((x) >> 24); \
	(ptr)[4] = (u8)((x) >> 32); \
	(ptr)[5] = (u8)((x) >> 40); \
	(ptr)[6] = (u8)((x) >> 48); \
	(ptr)[7] = (u8)((x) >> 56); \
})
#endif // SKINNYAEADM1_H_
\ No newline at end of file
/*******************************************************************************
* Implementation of the tweakey schedule according to the fixsliced
* representation.
*
* For more details, see the paper at: https://
*
* @author Alexandre Adomnicai, Nanyang Technological University,
* alexandre.adomnicai@ntu.edu.sg
*
* @date May 2020
*******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "tk_schedule.h"
typedef unsigned char u8;
typedef unsigned int u32;
/****************************************************************************
* The round constants according to the fixsliced representation:
* 448 words = 56 rounds x 8 bitsliced state words, XORed into the round
* tweakey material by precompute_rtk2_3.
* NOTE(review): this table is read-only; consider declaring it 'const'
* (check for extern declarations in other translation units first).
****************************************************************************/
u32 rconst_32_bs[448] = {
	0xfffffff3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
	0xffffffff, 0x00000300, 0xcffffcff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xffcffffc, 0xffcfffff, 0xffcfffff,
	0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xfcffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c30, 0xfffcf3cf, 0xffffffff, 0xffffffcf,
	0xffffff03, 0xffffff3f, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
	0xfffffcff, 0x00000300, 0xcffffc3f, 0xfffffcff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xf3ffffff, 0x00000300,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x0c000000, 0xf3fffffc, 0xffcfffff, 0xffcfffff,
	0xffc3ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x03c00000, 0xfc3fcfff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff33ff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffcffcf, 0xffffffcf, 0xffffffcf,
	0xfffffff3, 0xffffff3f, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000f0, 0xffffff3f, 0xffffffff,
	0xfffffcff, 0x000000c0, 0xcffffc3f, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x0c000300,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x3ffffcff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xf3cffffc, 0xffffffff, 0xffcfffff,
	0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00300000, 0xffffffff, 0xffffffff,
	0xfcffffff, 0x00000000, 0xff3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
	0xffffffff, 0x00000000, 0xffffffff, 0xffff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffff,
	0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000c0, 0xffffffff, 0xffffffff,
	0xffffffff, 0x00000000, 0xcffffcff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x0c000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x3fffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xffcffffc, 0xffffffff, 0xffcfffff,
	0xff33ffff, 0xff3fffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xffffffff, 0xffffffff,
	0xfcffffff, 0x00000000, 0xfc3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x03000000,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xffff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c00, 0xfffcf3ff, 0xffffffff, 0xffffffcf,
	0xffffffc3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000f0, 0xffffffff, 0xffffffff,
	0xffffffff, 0x00000300, 0xcffffc3f, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00300000, 0xfffffffc, 0xffcfffff, 0xffcfffff,
	0xff33ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x00c00000, 0xfc3fcfff, 0xfcffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xfffff3ff, 0x00000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000c00, 0xfffcffcf, 0xffffffff, 0xffffffcf,
	0xffffffc3, 0xffffff3f, 0x00000000, 0xffffffff,
	0xffffffff, 0x00000030, 0xffffffff, 0xffffffff,
	0xfffffcff, 0x00000300, 0xcfffff3f, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000300,
	0xffffffff, 0x00000000, 0xffffffff, 0x33ffffff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xf3fffffc, 0xffcfffff, 0xffffffff,
	0xfff3ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00000000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x03c00000, 0xffffcfff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0x00000000, 0xffffffff, 0xfcff33ff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffcffff, 0xffffffcf, 0xffffffff,
	0xfffffff3, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x000000c0, 0xffffff3f, 0xffffffff,
	0xffffffff, 0x000003c0, 0xcffffcff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xf3ffffff, 0x00000000, 0xffffffff, 0x33fffcff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffffffc, 0xffcfffff, 0xffcfffff,
	0xfff3ffff, 0xffffffff, 0x00000000, 0xffffffff,
	0xffffffff, 0x00f00000, 0xff3fffff, 0xffffffff,
	0xffffffff, 0x00c00000, 0xfc3fcfff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
	0xffffffff, 0xffffffff, 0xffffffff, 0x00000c00,
	0xfffff3ff, 0x00000000, 0xffffffff, 0xfcff3fff,
	0xffffffff, 0x00000000, 0xffffffff, 0xffffffff,
	0x00000000, 0xfffcffcf, 0xffffffff, 0xffffffcf
};
/****************************************************************************
* Packs 2 input blocks B, B' into the state using a bitsliced representation.
* Once the packing process is complete, the 256-bit state consists of 8
* 32-bit words and the input blocks bit positioning is as follows:
*
* 24 24' 56 56' 88 88' 120 120' | ... | 0 0' 32 32' 64 64' 96 96'
* 25 25' 57 57' 89 89' 121 121' | ... | 1 1' 33 33' 65 65' 97 97'
* 26 26' 58 58' 90 90' 122 122' | ... | 2 2' 34 34' 66 66' 98 98'
* 27 27' 59 59' 91 91' 123 123' | ... | 3 3' 35 35' 67 67' 99 99'
* 28 28' 60 60' 92 92' 124 124' | ... | 4 4' 36 36' 68 68' 100 100'
* 29 29' 61 61' 93 93' 125 125' | ... | 5 5' 37 37' 69 69' 101 101'
* 30 30' 62 62' 94 94' 126 126' | ... | 6 6' 38 38' 70 70' 102 102'
* 31 31' 63 63' 95 95' 127 127' | ... | 7 7' 39 39' 71 71' 103 103'
****************************************************************************/
void packing(u32* out, const u8* block0, const u8* block1) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	// Load both blocks, interleaving their little-endian 32-bit words
	for(int i = 0; i < 4; i++) {
		LE_LOAD(out + 2*i, block0 + 4*i);
		LE_LOAD(out + 2*i + 1, block1 + 4*i);
	}
	// Interleave the bits of the two blocks within each word pair
	for(int i = 0; i < 8; i += 2)
		SWAPMOVE(out[i+1], out[i], 0x55555555, 1);
	// Redistribute 2-bit groups across the words; the order of these
	// SWAPMOVE steps is significant
	SWAPMOVE(out[2], out[0], 0x30303030, 2);
	SWAPMOVE(out[4], out[0], 0x0c0c0c0c, 4);
	SWAPMOVE(out[6], out[0], 0x03030303, 6);
	SWAPMOVE(out[3], out[1], 0x30303030, 2);
	SWAPMOVE(out[5], out[1], 0x0c0c0c0c, 4);
	SWAPMOVE(out[7], out[1], 0x03030303, 6);
	SWAPMOVE(out[4], out[2], 0x0c0c0c0c, 2);
	SWAPMOVE(out[6], out[2], 0x03030303, 4);
	SWAPMOVE(out[5], out[3], 0x0c0c0c0c, 2);
	SWAPMOVE(out[7], out[3], 0x03030303, 4);
	SWAPMOVE(out[6], out[4], 0x03030303, 2);
	SWAPMOVE(out[7], out[5], 0x03030303, 2);
}
/****************************************************************************
* Unpacks the 256-bit bitsliced state into two 16-byte output blocks,
* inverting the bit positioning applied by 'packing'. The bytes within
* each output block are in natural order 0..15.
****************************************************************************/
void unpacking(u8* out, u8* out_bis, u32 *in) {
	u32 tmp; // scratch word required by the SWAPMOVE macro
	// Undo the 2-bit group redistribution (exact reverse of 'packing')
	SWAPMOVE(in[6], in[4], 0x03030303, 2);
	SWAPMOVE(in[7], in[5], 0x03030303, 2);
	SWAPMOVE(in[5], in[3], 0x0c0c0c0c, 2);
	SWAPMOVE(in[7], in[3], 0x03030303, 4);
	SWAPMOVE(in[4], in[2], 0x0c0c0c0c, 2);
	SWAPMOVE(in[6], in[2], 0x03030303, 4);
	SWAPMOVE(in[7], in[1], 0x03030303, 6);
	SWAPMOVE(in[5], in[1], 0x0c0c0c0c, 4);
	SWAPMOVE(in[3], in[1], 0x30303030, 2);
	SWAPMOVE(in[6], in[0], 0x03030303, 6);
	SWAPMOVE(in[4], in[0], 0x0c0c0c0c, 4);
	SWAPMOVE(in[2], in[0], 0x30303030, 2);
	// Undo the bit interleaving of the two blocks
	for(int i = 0; i < 8; i += 2)
		SWAPMOVE(in[i+1], in[i], 0x55555555, 1);
	// Store the de-interleaved words back as little-endian bytes
	for(int i = 0; i < 4; i++) {
		LE_STORE(out + 4*i, in[2*i]);
		LE_STORE(out_bis + 4*i, in[2*i+1]);
	}
}
//Applies the tweakey permutation P twice, in a bitsliced manner
void permute_tk_2(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,14) & 0xcc00cc00;
		w |= (t & 0x000000ff) << 16;
		w |= (t & 0xcc000000) >> 2;
		w |= (t & 0x0033cc00) >> 8;
		w |= (t & 0x00cc0000) >> 18;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P four times, in a bitsliced manner
void permute_tk_4(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,22) & 0xcc0000cc;
		w |= ROR(t,16) & 0x3300cc00;
		w |= ROR(t,24) & 0x00cc3300;
		w |= (t & 0x00cc00cc) >> 2;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P six times, in a bitsliced manner
void permute_tk_6(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,6) & 0xcccc0000;
		w |= ROR(t,24) & 0x330000cc;
		w |= ROR(t,10) & 0x3333;
		w |= (t & 0xcc) << 14;
		w |= (t & 0x3300) << 2;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P eight times, in a bitsliced manner
void permute_tk_8(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,24) & 0xcc000033;
		w |= ROR(t,8) & 0x33cc0000;
		w |= ROR(t,26) & 0x00333300;
		w |= (t & 0x00333300) >> 6;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P ten times, in a bitsliced manner
void permute_tk_10(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,8) & 0xcc330000;
		w |= ROR(t,26) & 0x33000033;
		w |= ROR(t,22) & 0x00cccc00;
		w |= (t & 0x00330000) >> 14;
		w |= (t & 0xcc00) >> 2;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P twelve times, in a bitsliced manner
void permute_tk_12(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,8) & 0xcc33;
		w |= ROR(t,30) & 0x00cc00cc;
		w |= ROR(t,10) & 0x33330000;
		w |= ROR(t,16) & 0xcc003300;
		tk[i] = w;
	}
}
//Applies the tweakey permutation P fourteen times, in a bitsliced manner
void permute_tk_14(u32* tk) {
	for(int i = 0; i < 8; i++) {
		u32 t = tk[i];
		u32 w = ROR(t,24) & 0x0033cc00;
		w |= ROR(t,14) & 0x00cc0000;
		w |= ROR(t,30) & 0xcc000000;
		w |= ROR(t,16) & 0x000000ff;
		w |= ROR(t,18) & 0x33003300;
		tk[i] = w;
	}
}
/****************************************************************************
* Precomputes the bitsliced LFSR2 stream for TK2 into 'tk'. The LFSR is
* stepped once every 2 rounds; the round slots left untouched here are
* filled in later by permute_tk.
****************************************************************************/
void precompute_lfsr_tk2(u32* tk, const u8* tk2_0,
		const u8* tk2_1, const int rounds) {
	u32 tmp; // scratch word required by the LFSR2 macro
	u32 state[8];
	packing(state, tk2_0, tk2_1);
	for(int j = 0; j < 8; j++)
		tk[j] = state[j];
	for(int i = 0; i < rounds; i += 2) {
		LFSR2(state);
		for(int j = 0; j < 8; j++)
			tk[i*8 + 8 + j] = state[j];
	}
}
/****************************************************************************
* Precomputes the bitsliced LFSR3 stream for TK3 and XORs it into 'tk'
* (on top of the TK2 material written by precompute_lfsr_tk2). The LFSR
* is stepped once every 2 rounds, mirroring precompute_lfsr_tk2.
****************************************************************************/
void precompute_lfsr_tk3(u32* tk, const u8* tk3_0,
		const u8* tk3_1, const int rounds) {
	u32 tmp; // scratch word required by the LFSR3 macro
	u32 state[8];
	packing(state, tk3_0, tk3_1);
	for(int j = 0; j < 8; j++)
		tk[j] ^= state[j];
	for(int i = 0; i < rounds; i += 2) {
		LFSR3(state);
		for(int j = 0; j < 8; j++)
			tk[i*8 + 8 + j] ^= state[j];
	}
}
/****************************************************************************
* XORs the packed TK1 into the precomputed tweakey material in 'tk' and
* rearranges everything to match the fixsliced representation.
*
* The loop processes 8 rounds per iteration. Each pair of rounds is read,
* XORed with TK1, permuted by the appropriate power of P (P^2/P^4/P^6/P^8
* during the first half of each 16-round period, P^10/P^12/P^14 during the
* second half -- TK1 itself has period 16), and finally masked/rotated so
* that only rows 1&2 remain, in the bit positions the fixsliced round
* functions expect.
****************************************************************************/
void permute_tk(u32* tk, const u8* tk1_0, const u8* tk1_1, const int rounds) {
	u32 test;	// selects which power of P to apply in this iteration
	u32 tk1[8], tmp[8];
	packing(tk1, tk1_0, tk1_1);
	// Round 0: no permutation yet, just mask rows 1&2
	memcpy(tmp, tk, 32);
	XOR_BLOCK(tmp, tk1);
	tk[0] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
	tk[1] = tmp[5] & 0xf0f0f0f0;
	tk[2] = tmp[0] & 0xf0f0f0f0;
	tk[3] = tmp[1] & 0xf0f0f0f0;
	tk[4] = tmp[3] & 0xf0f0f0f0;
	tk[5] = tmp[7] & 0xf0f0f0f0;
	tk[6] = tmp[4] & 0xf0f0f0f0;
	tk[7] = tmp[2] & 0xf0f0f0f0;
	for(int i = 0 ; i < rounds; i+=8) {
		test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P
		// Rounds i+1 and i+2 (P^2 or P^10)
		memcpy(tmp, tk+i*8+8, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_2(tmp); // applies P^2
		else
			permute_tk_10(tmp); // applies P^10
		tk[i*8+8] = ROR(tmp[4],26) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+9] = ROR(tmp[7],26) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+10] = ROR(tmp[6],26) & 0xc3c3c3c3;
		tk[i*8+11] = ROR(tmp[5],26) & 0xc3c3c3c3;
		tk[i*8+12] = ROR(tmp[1],26) & 0xc3c3c3c3;
		tk[i*8+13] = ROR(tmp[2],26) & 0xc3c3c3c3;
		tk[i*8+14] = ROR(tmp[3],26) & 0xc3c3c3c3;
		tk[i*8+15] = ROR(tmp[0],26) & 0xc3c3c3c3;
		tk[i*8+16] = ROR(tmp[3],28) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+16] |= ROR(tmp[3],12) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+17] = ROR(tmp[2],28) & 0x03030303;
		tk[i*8+17] |= ROR(tmp[2],12) & 0x0c0c0c0c;
		tk[i*8+18] = ROR(tmp[4],28) & 0x03030303;
		tk[i*8+18] |= ROR(tmp[4],12) & 0x0c0c0c0c;
		tk[i*8+19] = ROR(tmp[7],28) & 0x03030303;
		tk[i*8+19] |= ROR(tmp[7],12) & 0x0c0c0c0c;
		tk[i*8+20] = ROR(tmp[5],28) & 0x03030303;
		tk[i*8+20] |= ROR(tmp[5],12) & 0x0c0c0c0c;
		tk[i*8+21] = ROR(tmp[0],28) & 0x03030303;
		tk[i*8+21] |= ROR(tmp[0],12) & 0x0c0c0c0c;
		tk[i*8+22] = ROR(tmp[1],28) & 0x03030303;
		tk[i*8+22] |= ROR(tmp[1],12) & 0x0c0c0c0c;
		tk[i*8+23] = ROR(tmp[6],28) & 0x03030303;
		tk[i*8+23] |= ROR(tmp[6],12) & 0x0c0c0c0c;
		// Rounds i+3 and i+4 (P^4 or P^12)
		memcpy(tmp, tk+i*8+24, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_4(tmp); // applies P^4
		else
			permute_tk_12(tmp); // applies P^12
		tk[i*8+24] = ROR(tmp[1],14) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+24] |= ROR(tmp[1],6) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+25] = ROR(tmp[0],14) & 0x30303030;
		tk[i*8+25] |= ROR(tmp[0],6) & 0x0c0c0c0c;
		tk[i*8+26] = ROR(tmp[3],14) & 0x30303030;
		tk[i*8+26] |= ROR(tmp[3],6) & 0x0c0c0c0c;
		tk[i*8+27] = ROR(tmp[2],14) & 0x30303030;
		tk[i*8+27] |= ROR(tmp[2],6) & 0x0c0c0c0c;
		tk[i*8+28] = ROR(tmp[7],14) & 0x30303030;
		tk[i*8+28] |= ROR(tmp[7],6) & 0x0c0c0c0c;
		tk[i*8+29] = ROR(tmp[6],14) & 0x30303030;
		tk[i*8+29] |= ROR(tmp[6],6) & 0x0c0c0c0c;
		tk[i*8+30] = ROR(tmp[5],14) & 0x30303030;
		tk[i*8+30] |= ROR(tmp[5],6) & 0x0c0c0c0c;
		tk[i*8+31] = ROR(tmp[4],14) & 0x30303030;
		tk[i*8+31] |= ROR(tmp[4],6) & 0x0c0c0c0c;
		tk[i*8+32] = ROR(tmp[6],16) & 0xf0f0f0f0; //mask to extract rows 1&2 only
		tk[i*8+33] = ROR(tmp[5],16) & 0xf0f0f0f0; //rotation to match fixslicing
		tk[i*8+34] = ROR(tmp[0],16) & 0xf0f0f0f0;
		tk[i*8+35] = ROR(tmp[1],16) & 0xf0f0f0f0;
		tk[i*8+36] = ROR(tmp[3],16) & 0xf0f0f0f0;
		tk[i*8+37] = ROR(tmp[7],16) & 0xf0f0f0f0;
		tk[i*8+38] = ROR(tmp[4],16) & 0xf0f0f0f0;
		tk[i*8+39] = ROR(tmp[2],16) & 0xf0f0f0f0;
		// Rounds i+5 and i+6 (P^6 or P^14)
		memcpy(tmp, tk+i*8+40, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_6(tmp); // applies P^6
		else
			permute_tk_14(tmp); // applies P^14
		tk[i*8+40] = ROR(tmp[4],10) & 0xc3c3c3c3; //mask to extract rows 1&2 only
		tk[i*8+41] = ROR(tmp[7],10) & 0xc3c3c3c3; //rotation to match fixslicing
		tk[i*8+42] = ROR(tmp[6],10) & 0xc3c3c3c3;
		tk[i*8+43] = ROR(tmp[5],10) & 0xc3c3c3c3;
		tk[i*8+44] = ROR(tmp[1],10) & 0xc3c3c3c3;
		tk[i*8+45] = ROR(tmp[2],10) & 0xc3c3c3c3;
		tk[i*8+46] = ROR(tmp[3],10) & 0xc3c3c3c3;
		tk[i*8+47] = ROR(tmp[0],10) & 0xc3c3c3c3;
		tk[i*8+48] = ROR(tmp[3],12) & 0x03030303; //mask to extract rows 1&2 only
		tk[i*8+48] |= ROR(tmp[3],28) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+49] = ROR(tmp[2],12) & 0x03030303;
		tk[i*8+49] |= ROR(tmp[2],28) & 0x0c0c0c0c;
		tk[i*8+50] = ROR(tmp[4],12) & 0x03030303;
		tk[i*8+50] |= ROR(tmp[4],28) & 0x0c0c0c0c;
		tk[i*8+51] = ROR(tmp[7],12) & 0x03030303;
		tk[i*8+51] |= ROR(tmp[7],28) & 0x0c0c0c0c;
		tk[i*8+52] = ROR(tmp[5],12) & 0x03030303;
		tk[i*8+52] |= ROR(tmp[5],28) & 0x0c0c0c0c;
		tk[i*8+53] = ROR(tmp[0],12) & 0x03030303;
		tk[i*8+53] |= ROR(tmp[0],28) & 0x0c0c0c0c;
		tk[i*8+54] = ROR(tmp[1],12) & 0x03030303;
		tk[i*8+54] |= ROR(tmp[1],28) & 0x0c0c0c0c;
		tk[i*8+55] = ROR(tmp[6],12) & 0x03030303;
		tk[i*8+55] |= ROR(tmp[6],28) & 0x0c0c0c0c;
		// Rounds i+7 and i+8 (P^8; P^16 is the identity, so nothing to do
		// in the second half of the period)
		memcpy(tmp, tk+i*8+56, 32);
		XOR_BLOCK(tmp, tk1);
		if (test)
			permute_tk_8(tmp); // applies P^8
		tk[i*8+56] = ROR(tmp[1],30) & 0x30303030; //mask to extract rows 1&2 only
		tk[i*8+56] |= ROR(tmp[1],22) & 0x0c0c0c0c; //rotation to match fixslicing
		tk[i*8+57] = ROR(tmp[0],30) & 0x30303030;
		tk[i*8+57] |= ROR(tmp[0],22) & 0x0c0c0c0c;
		tk[i*8+58] = ROR(tmp[3],30) & 0x30303030;
		tk[i*8+58] |= ROR(tmp[3],22) & 0x0c0c0c0c;
		tk[i*8+59] = ROR(tmp[2],30) & 0x30303030;
		tk[i*8+59] |= ROR(tmp[2],22) & 0x0c0c0c0c;
		tk[i*8+60] = ROR(tmp[7],30) & 0x30303030;
		tk[i*8+60] |= ROR(tmp[7],22) & 0x0c0c0c0c;
		tk[i*8+61] = ROR(tmp[6],30) & 0x30303030;
		tk[i*8+61] |= ROR(tmp[6],22) & 0x0c0c0c0c;
		tk[i*8+62] = ROR(tmp[5],30) & 0x30303030;
		tk[i*8+62] |= ROR(tmp[5],22) & 0x0c0c0c0c;
		tk[i*8+63] = ROR(tmp[4],30) & 0x30303030;
		tk[i*8+63] |= ROR(tmp[4],22) & 0x0c0c0c0c;
		//if (test && (i+8 < rounds)) { //only if next loop iteration
		if (i+8 < rounds) { //only if next loop iteration
			tk[i*8+64] = tmp[6] & 0xf0f0f0f0; //mask to extract rows 1&2 only
			tk[i*8+65] = tmp[5] & 0xf0f0f0f0;
			tk[i*8+66] = tmp[0] & 0xf0f0f0f0;
			tk[i*8+67] = tmp[1] & 0xf0f0f0f0;
			tk[i*8+68] = tmp[3] & 0xf0f0f0f0;
			tk[i*8+69] = tmp[7] & 0xf0f0f0f0;
			tk[i*8+70] = tmp[4] & 0xf0f0f0f0;
			tk[i*8+71] = tmp[2] & 0xf0f0f0f0;
		}
	}
}
//Precompute LFSR2(TK2) ^ LFSR3(TK3) ^ rconst for 'rounds' rounds, in the
//fixsliced representation.
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3, int rounds) {
	memset(rtk, 0x00, 32*rounds);
	precompute_lfsr_tk2(rtk, tk2, tk2, rounds);
	precompute_lfsr_tk3(rtk, tk3, tk3, rounds);
	// rtk+16 (the round-2 slot) is still all-zero here: the LFSR
	// precomputations above only fill rounds 0, 1, 3, 5, ..., and
	// permute_tk derives the remaining slots itself. Passing it as the
	// TK1 input therefore XORs in zeros, i.e. no TK1 contribution.
	permute_tk(rtk, (u8*)(rtk+16), (u8*)(rtk+16), rounds); // rtk+16 is NULL
	for(int i = 0; i < rounds; i++) { // add rconsts
		for(int j = 0; j < 8; j++)
			rtk[i*8+j] ^= rconst_32_bs[i*8+j];
	}
}
//Precompute the round tweakeys for TK1. Only 16 rounds are expanded since
//the TK1 schedule repeats with period 16 (see the rtk1[8*16] field).
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis) {
	const int rounds = 16;
	memset(rtk1, 0x00, rounds * 32);
	permute_tk(rtk1, tk1, tk1_bis, rounds);
}
#ifndef TK_SCHEDULE_BS_H_
#define TK_SCHEDULE_BS_H_
typedef unsigned char u8;
typedef unsigned int u32;
/* Precomputed round tweakeys in the fixsliced representation: rtk1 covers
 * 16 rounds, rtk2_3 covers 56 rounds (8 bitsliced words per round). */
typedef struct {
	u32 rtk1[8*16];
	u32 rtk2_3[8*56];
} tweakey;
void packing(u32* out, const u8* block0, const u8* block1);
void unpacking(u8* out, u8* out_bis, u32 *in);
void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3, int rounds);
void precompute_rtk1(u32* rtk1, const u8* tk1, const u8* tk1_bis);
/* Bitsliced LFSR for TK2. NOTE: requires a u32 'tmp' in the calling scope. */
#define LFSR2(tk) ({ \
	tmp = (tk)[0] ^ (tk)[2]; \
	(tk)[0] = (tk)[1]; \
	(tk)[1] = (tk)[2]; \
	(tk)[2] = (tk)[3]; \
	(tk)[3] = (tk)[4]; \
	(tk)[4] = (tk)[5]; \
	(tk)[5] = (tk)[6]; \
	(tk)[6] = (tk)[7]; \
	(tk)[7] = tmp; \
})
/* Bitsliced LFSR for TK3. NOTE: requires a u32 'tmp' in the calling scope. */
#define LFSR3(tk) ({ \
	tmp = (tk)[7] ^ (tk)[1]; \
	(tk)[7] = (tk)[6]; \
	(tk)[6] = (tk)[5]; \
	(tk)[5] = (tk)[4]; \
	(tk)[4] = (tk)[3]; \
	(tk)[3] = (tk)[2]; \
	(tk)[2] = (tk)[1]; \
	(tk)[1] = (tk)[0]; \
	(tk)[0] = tmp; \
})
/* XORs the 8-word block 'y' into 'x'. */
#define XOR_BLOCK(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
	(x)[4] ^= (y)[4]; \
	(x)[5] ^= (y)[5]; \
	(x)[6] ^= (y)[6]; \
	(x)[7] ^= (y)[7]; \
})
/* Swaps the bits of 'b' selected by 'mask' with the bits of 'a' selected by
 * 'mask << n'. NOTE: requires a u32 'tmp' in the calling scope; arguments
 * are evaluated more than once. */
#define SWAPMOVE(a, b, mask, n) ({ \
	tmp = (b ^ (a >> n)) & mask; \
	b ^= tmp; \
	a ^= (tmp << n); \
})
/* Loads 4 little-endian bytes from 'y' into the u32 pointed to by 'x'.
 * Wrapped as a single expression statement so it is safe in unbraced
 * if/else bodies (CERT PRE10-C). */
#define LE_LOAD(x, y) \
	((void)(*(x) = (((u32)(y)[3] << 24) | \
		((u32)(y)[2] << 16) | \
		((u32)(y)[1] << 8) | \
		(y)[0])))
/* Stores the u32 'y' as 4 little-endian bytes at 'x'. Wrapped in
 * do/while(0) so the multi-statement body acts as one statement. */
#define LE_STORE(x, y) do { \
	(x)[0] = (y) & 0xff; \
	(x)[1] = ((y) >> 8) & 0xff; \
	(x)[2] = ((y) >> 16) & 0xff; \
	(x)[3] = (y) >> 24; \
} while(0)
/* Rotates the 32-bit word 'x' right by 'y' positions (0 < y < 32). */
#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
#endif // TK_SCHEDULE_BS_H_
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment