///////////////////////////////////////////////////////////////////////////////
// sparkleARM.c: Assembler implementation for ARM of the SPARKLE permutation.//
// This file is part of the SPARKLE submission to NIST's LW Crypto Project. //
// Version 0.2.0 (2019-03-28), see <http://www.cryptolux.org/> for updates. //
// Authors: The SPARKLE Group (C. Beierle, A. Biryukov, L. Cardoso dos //
// Santos, J. Groszschaedl, L. Perrin, A. Udovenko, V. Velichkov, Q. Wang). //
// License: GPLv3 (see LICENSE file), other licenses available upon request. //
// Copyright (C) 2019 University of Luxembourg <http://www.uni.lu/>. //
// ------------------------------------------------------------------------- //
// This program is free software: you can redistribute it and/or modify it //
// under the terms of the GNU General Public License as published by the //
// Free Software Foundation, either version 3 of the License, or (at your //
// option) any later version. This program is distributed in the hope that //
// it will be useful, but WITHOUT ANY WARRANTY; without even the implied //
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. You should have received a //
// copy of the GNU General Public License along with this program. If not, //
// see <http://www.gnu.org/licenses/>. //
///////////////////////////////////////////////////////////////////////////////
#include <stdint.h>
#include "sparkle_ref.h"
// ROT(x, n): rotate the 32-bit value x right by n bits, for 0 <= n <= 31.
// The left-shift count is reduced modulo 32 so that n == 0 does not shift by
// the full word width, which is undefined behaviour in C. x is evaluated
// twice, so it must be free of side effects; it is expected to be a uint32_t.
#define ROT(x, n) (((x) >> (n)) | ((x) << ((32 - (n)) & 31)))

// ELL(x): XOR x with (x << 16), then rotate the result by 16 bits (i.e. swap
// its two 16-bit halves). x is evaluated more than once.
#define ELL(x) (ROT(((x) ^ ((x) << 16)), 16))

// ELLASM(X): ARM inline-assembly form of ELL that updates the lvalue X in
// place (EOR with the LSL #16 shifted operand, then ROR #16). Use it as a
// statement: ELLASM(v);
#define ELLASM(X) \
  __asm__ __volatile__( \
    "EOR %[x], %[x], %[x], LSL #16 \n\t" \
    "ROR %[x], #16 \n\t" \
    : [x] "+r" (X) \
    : \
  )
// Round-constant table. The ARX-box layer uses entry b as the constant for
// branch b, and sparkle_optARM XORs entry (step % MAX_BRANCHES) into state[1]
// at the start of every step.
static const uint32_t RCON[MAX_BRANCHES] = {
  0xB7E15162,  // [0]
  0xBF715880,  // [1]
  0x38B4DA56,  // [2]
  0x324E7738,  // [3]
  0xBB1185EB,  // [4]
  0x4F7C7B57,  // [5]
  0xCFBFA1C8,  // [6]
  0xC2B3293D   // [7]
};
// ARXBOXasm: apply the four-round ARX-box to a single branch, in place.
//
//   x, y : pointers to the two 32-bit words of the branch; both are read and
//          then overwritten with the result.
//   c    : round constant, XORed into x at the end of every round.
//
// Each round is  x += ROT(y, r);  y ^= ROT(x, s);  x ^= c;  with the rotation
// pairs (r, s) = (31, 24), (17, 17), (0, 31), (24, 16).
//
// On 32-bit ARM the rounds are emitted as inline assembly. x and y are
// early-clobber operands ("+&r"): they are written long before the last read
// of c, so the compiler must not be allowed to place c in the same register
// as either of them. On any other target an equivalent portable C sequence
// is used so the routine can still be built and checked off-target.
void ARXBOXasm(uint32_t *x, uint32_t *y, const uint32_t c){
#if defined(__arm__) || defined(__thumb__)
  __asm__ __volatile__ (
    "ADD %[x], %[x], %[y], ror #31 \n\t"
    "EOR %[y], %[y], %[x], ror #24 \n\t"
    "EOR %[x], %[c] \n\t"
    "ADD %[x], %[x], %[y], ror #17 \n\t"
    "EOR %[y], %[y], %[x], ror #17 \n\t"
    "EOR %[x], %[c] \n\t"
    "ADD %[x], %[y] \n\t"
    "EOR %[y], %[y], %[x], ror #31 \n\t"
    "EOR %[x], %[c] \n\t"
    "ADD %[x], %[x], %[y], ror #24 \n\t"
    "EOR %[y], %[y], %[x], ror #16 \n\t"
    "EOR %[x], %[c] \n\t"
    : [x] "+&r" (*x), [y] "+&r" (*y)
    : [c] "r" (c)
  );
#else
  uint32_t xv = *x;
  uint32_t yv = *y;

  // Right-rotations written out as shift pairs (rotate right by r ==
  // (v >> r) | (v << (32 - r))).
  xv += (yv >> 31) | (yv << 1);
  yv ^= (xv >> 24) | (xv << 8);
  xv ^= c;

  xv += (yv >> 17) | (yv << 15);
  yv ^= (xv >> 17) | (xv << 15);
  xv ^= c;

  xv += yv;
  yv ^= (xv >> 31) | (xv << 1);
  xv ^= c;

  xv += (yv >> 24) | (yv << 8);
  yv ^= (xv >> 16) | (xv << 16);
  xv ^= c;

  *x = xv;
  *y = yv;
#endif
}
// ARXBoxfullasm: apply the ARX-box to the branches of state, in place.
//
//   state : array of 32-bit words; branch b is the pair
//           (state[2*b], state[2*b+1]) and uses round constant RCON[b].
//   nb    : number of branches to process. Two branches are handled per loop
//           iteration, so nb is expected to be even (an odd nb would process
//           one extra branch) and no larger than MAX_BRANCHES, since one RCON
//           entry is consumed per branch. nb <= 0 does nothing.
//   ns    : unused; kept so that existing callers keep compiling.
//
// Define INTERLEAVED_ARXBOX to select the variant that interleaves the
// instructions of the two branches of an iteration.
//
// Notes on the inline assembly:
//  * The scratch registers are declared as clobbers instead of being saved
//    with push/pop, so the compiler can never place %[state], %[nb] or
//    %[rcon] in one of them and saves/restores them itself where needed.
//  * r10 is used rather than r7 for the fourth state word: r7 is the Thumb
//    frame pointer, and compilers reject it in a clobber list when a frame
//    pointer is in use (e.g. unoptimised builds).
//  * The constant pointer is advanced by the loop, so it is a read-write
//    operand backed by a local rather than a read-only input.
//  * "cc" covers the cmp; "memory" tells the compiler that the asm reads and
//    writes the state array, so surrounding accesses are not cached in
//    registers or reordered across it.
//
// On non-ARM targets a portable C loop with the same two-branches-per-
// iteration structure is used instead.
void ARXBoxfullasm(uint32_t *state, int nb, int ns){
  const uint32_t *rcon = RCON;

  (void) ns;

#if defined(__arm__) || defined(__thumb__)
  __asm__ __volatile__(
    "1: \n\t" //loop entry point
    "cmp %[nb], #0 \n\t"
    "ble 2f \n\t" //leave once nb has dropped to zero or below
    //load 4 state words (2 branches) and their 2 round constants
    "ldmia.w %[state], {r4, r5, r6, r10} \n\t"
    "ldmia.w %[rcon]!, {r8, r9} \n\t" //writeback: rcon advances by 2 words
#ifndef INTERLEAVED_ARXBOX
    //apply arxbox to 2 branches, one after the other
    "ADD r4, r4, r5, ror #31 \n\t"
    "EOR r5, r5, r4, ror #24 \n\t"
    "EOR r4, r8 \n\t"
    "ADD r4, r4, r5, ror #17 \n\t"
    "EOR r5, r5, r4, ror #17 \n\t"
    "EOR r4, r8 \n\t"
    "ADD r4, r5 \n\t"
    "EOR r5, r5, r4, ror #31 \n\t"
    "EOR r4, r8 \n\t"
    "ADD r4, r4, r5, ror #24 \n\t"
    "EOR r5, r5, r4, ror #16 \n\t"
    "EOR r4, r8 \n\t"
    //-----------------------------------
    "ADD r6, r6, r10, ror #31 \n\t"
    "EOR r10, r10, r6, ror #24 \n\t"
    "EOR r6, r9 \n\t"
    "ADD r6, r6, r10, ror #17 \n\t"
    "EOR r10, r10, r6, ror #17 \n\t"
    "EOR r6, r9 \n\t"
    "ADD r6, r10 \n\t"
    "EOR r10, r10, r6, ror #31 \n\t"
    "EOR r6, r9 \n\t"
    "ADD r6, r6, r10, ror #24 \n\t"
    "EOR r10, r10, r6, ror #16 \n\t"
    "EOR r6, r9 \n\t"
#else
    //apply arxbox to 2 branches, interweaving for pipelining. Only good for Cortex M4
    "ADD r4, r4, r5, ror #31 \n\t"
    "ADD r6, r6, r10, ror #31 \n\t"
    "EOR r5, r5, r4, ror #24 \n\t"
    "EOR r10, r10, r6, ror #24 \n\t"
    "EOR r4, r8 \n\t"
    "EOR r6, r9 \n\t"
    "ADD r4, r4, r5, ror #17 \n\t"
    "ADD r6, r6, r10, ror #17 \n\t"
    "EOR r5, r5, r4, ror #17 \n\t"
    "EOR r10, r10, r6, ror #17 \n\t"
    "EOR r4, r8 \n\t"
    "EOR r6, r9 \n\t"
    "ADD r4, r5 \n\t"
    "ADD r6, r10 \n\t"
    "EOR r5, r5, r4, ror #31 \n\t"
    "EOR r10, r10, r6, ror #31 \n\t"
    "EOR r4, r8 \n\t"
    "EOR r6, r9 \n\t"
    "ADD r4, r4, r5, ror #24 \n\t"
    "ADD r6, r6, r10, ror #24 \n\t"
    "EOR r5, r5, r4, ror #16 \n\t"
    "EOR r10, r10, r6, ror #16 \n\t"
    "EOR r4, r8 \n\t"
    "EOR r6, r9 \n\t"
#endif
    //store the 4 words back; writeback moves state on to the next 2 branches
    "stmia %[state]!, {r4, r5, r6, r10} \n\t"
    "sub %[nb], #2 \n\t" //two branches done
    "b 1b \n\t" //loop back
    "2: \n\t"
    : [state] "+r" (state), [nb] "+r" (nb), [rcon] "+r" (rcon)
    :
    : "r4", "r5", "r6", "r8", "r9", "r10", "cc", "memory"
  );
#else
  for (; nb > 0; nb -= 2) {
    for (int k = 0; k < 2; k++) {
      uint32_t x = state[2*k];
      uint32_t y = state[2*k + 1];
      const uint32_t c = rcon[k];

      // Right-rotations written out as shift pairs.
      x += (y >> 31) | (y << 1);
      y ^= (x >> 24) | (x << 8);
      x ^= c;

      x += (y >> 17) | (y << 15);
      y ^= (x >> 17) | (x << 15);
      x ^= c;

      x += y;
      y ^= (x >> 31) | (x << 1);
      x ^= c;

      x += (y >> 24) | (y << 8);
      y ^= (x >> 16) | (x << 16);
      x ^= c;

      state[2*k] = x;
      state[2*k + 1] = y;
    }
    state += 4;
    rcon += 2;
  }
#endif
}
// sparkle_optARM: run ns steps of the permutation on state, in place.
//
//   state : 2*nb 32-bit words; branch b is (state[2*b], state[2*b+1]).
//   nb    : number of branches.
//   ns    : number of steps to execute.
//
// Every step consists of: XORing the step-dependent values into state[1] and
// state[3], the ARX-box layer over all branches (ARXBoxfullasm), and the
// linear layer, which mixes the first nb words into the last nb words and
// rotates the branches.
void sparkle_optARM(uint32_t *state, int nb, int ns)
{
  for (int step = 0; step < ns; step++) {
    // Inject the round constant and the step counter
    state[1] ^= RCON[step % MAX_BRANCHES];
    state[3] ^= (uint32_t) step;

    // ARX-box layer over all nb branches
    ARXBoxfullasm(state, nb, ns);

    // Linear layer, part 1: XOR together the x words and the y words of the
    // first nb words of the state, then pass both sums through ELL.
    const uint32_t first_x = state[0];
    const uint32_t first_y = state[1];
    uint32_t sum_x = first_x;
    uint32_t sum_y = first_y;
    for (int k = 2; k < nb; k += 2) {
      sum_x ^= state[k];
      sum_y ^= state[k + 1];
    }
    sum_x = ELL(sum_x);
    sum_y = ELL(sum_y);

    // Linear layer, part 2: each word of the upper half is combined with its
    // lower-half partner and the opposite ELL value and written one branch
    // further down, while the lower-half word moves into the upper half.
    uint32_t *const upper = state + nb;
    for (int k = 2; k < nb; k += 2) {
      state[k - 2] = upper[k] ^ state[k] ^ sum_y;
      upper[k] = state[k];
      state[k - 1] = upper[k + 1] ^ state[k + 1] ^ sum_x;
      upper[k + 1] = state[k + 1];
    }

    // The original first branch wraps around to the last lower-half slot.
    state[nb - 2] = upper[0] ^ first_x ^ sum_y;
    upper[0] = first_x;
    state[nb - 1] = upper[1] ^ first_y ^ sum_x;
    upper[1] = first_y;
  }
}