.syntax unified
.cpu cortex-m3
// sparkle256_arm.asm: ARM Asm implementation of the SPARKLE256 permutation. //
// This file is part of the SPARKLE submission to NIST's LW Crypto Project. //
// Version 1.0.1 (2019-06-29), see <http://www.cryptolux.org/> for updates. //
// Authors: The SPARKLE Group (C. Beierle, A. Biryukov, L. Cardoso dos //
// Santos, J. Groszschaedl, L. Perrin, A. Udovenko, V. Velichkov, Q. Wang). //
// License: GPLv3 (see LICENSE file), other licenses available upon request. //
// Copyright (C) 2019 University of Luxembourg <http://www.uni.lu/>. //
// ------------------------------------------------------------------------- //
// This program is free software: you can redistribute it and/or modify it //
// under the terms of the GNU General Public License as published by the //
// Free Software Foundation, either version 3 of the License, or (at your //
// option) any later version. This program is distributed in the hope that //
// it will be useful, but WITHOUT ANY WARRANTY; without even the implied //
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //
// GNU General Public License for more details. You should have received a //
// copy of the GNU General Public License along with this program. If not, //
// see <http://www.gnu.org/licenses/>. //
/* Register names and constants */
// Several names share one physical register: sta/scnt (r0), c0w/tmpx (r3)
// and c1w/tmpy (r4). In the code visible in this file the two names of a
// pair are never needed at the same time.
// register sta holds the start address of the state array (first argument)
#define sta r0
// register scnt holds the step counter (for loop termination); it reuses r0
// after the state pointer has been saved on the stack
#define scnt r0
// register ns holds the second argument, i.e. the number of steps
#define ns r1
// register rca holds the start address of the round-constant array (RCON)
#define rca r2
// register c0w holds one word of the round-constant array
#define c0w r3
// register c1w holds another word of the round-constant array
#define c1w r4
// register x0w holds the first word of the state array
#define x0w r5
// register y0w holds the second word of the state array
#define y0w r6
// register x1w holds the third word of the state array
#define x1w r7
// register y1w holds the fourth word of the state array
#define y1w r8
// register x2w holds the fifth word of the state array
#define x2w r9
// register y2w holds the sixth word of the state array
#define y2w r10
// register x3w holds the seventh word of the state array
#define x3w fp
// register y3w holds the eighth word of the state array
#define y3w ip
// register tmpx holds a temporary value (same register as c0w)
#define tmpx r3
// register tmpy holds another temporary value (same register as c1w)
#define tmpy r4
/* Round Constants */
// Table of eight 32-bit round constants, addressed through the symbol RCON
// (loaded into rca by PROLOGUE_256). The table is emitted into whatever
// section is current at this point of the file.
// The table is read with ldmia and ldr, so it is forced onto a 4-byte
// boundary.
    .balign 4
    .type   RCON, %object
RCON:
    .word   0xB7E15162
    .word   0xBF715880
    .word   0x38B4DA56
    .word   0x324E7738
    .word   0xBB1185EB
    .word   0x4F7C7B57
    .word   0xCFBFA1C8
    .word   0xC2B3293D
    .size   RCON, .-RCON
/* MACROS */
// PROLOGUE_256: entry sequence of the SPARKLE256 routine.
//   1. saves r4-r12 on the stack
//   2. loads the address of the round-constant table RCON into rca
//   3. loads the eight state words from the array at sta into x0w..y3w
//   4. saves the state pointer sta on the stack, which frees r0 so that it
//      can be reused as the step counter scnt
// The stack layout produced here is undone by EPILOGUE_256.
.macro PROLOGUE_256
    push    {r4-r12}
    ldr     rca, =RCON
    ldm.w   sta, {x0w-y3w}
    push    {sta}
.endm
// EPILOGUE_256: exit sequence of the SPARKLE256 routine.
//   1. restores the state pointer into sta (it was pushed last by the
//      prologue, so it is popped first)
//   2. writes the eight state words x0w..y3w back to the array at sta
//   3. restores r4-r12
//   4. returns to the caller through lr
.macro EPILOGUE_256
    pop     {sta}
    stm.w   sta, {x0w-y3w}
    pop     {r4-r12}
    bx      lr
.endm
// ADD_STEP_CNT_256: inject a round constant and the step counter.
//   y0w ^= 32-bit word at address rca + 4 * (scnt AND 7)
//   y1w ^= scnt
// The AND with 7 wraps the index into the eight-entry constant table.
// Clobbers c0w.
.macro ADD_STEP_CNT_256
    and     c0w, scnt, #7
    ldr     c0w, [rca, c0w, lsl #2]
    eor     y0w, y0w, c0w
    eor     y1w, y1w, scnt
.endm
// ARX_BOX xi, yi, ci: four add-rotate-xor rounds on the register pair
// (xi, yi) with the constant register ci.
// Each round performs, with ROR = 32-bit rotate right:
//   xi += ROR(yi, a);  yi ^= ROR(xi, b);  xi ^= ci
// using (a, b) = (31, 24), (17, 17), (0, 31), (24, 16) in that order.
// xi and yi are updated in place; ci is only read. All three arguments are
// required and must be registers.
.macro ARX_BOX xi:req, yi:req, ci:req
    add     \xi, \xi, \yi, ror #31
    eor     \yi, \yi, \xi, ror #24
    eor     \xi, \xi, \ci

    add     \xi, \xi, \yi, ror #17
    eor     \yi, \yi, \xi, ror #17
    eor     \xi, \xi, \ci

    add     \xi, \xi, \yi
    eor     \yi, \yi, \xi, ror #31
    eor     \xi, \xi, \ci

    add     \xi, \xi, \yi, ror #24
    eor     \yi, \yi, \xi, ror #16
    eor     \xi, \xi, \ci
.endm
// ARXBOX_LAYER_256: apply ARX_BOX to all four (x, y) branches of the state.
// The constants are fetched two at a time from the table at rca: words 0
// and 1 serve branches 0 and 1, words 2 and 3 serve branches 2 and 3.
// Both loads post-increment rca (2 x 8 bytes), so the final sub restores rca
// to the start of the table.
// Clobbers c0w and c1w.
.macro ARXBOX_LAYER_256
    ldmia   rca!, {c0w, c1w}
    ARX_BOX x0w, y0w, c0w
    ARX_BOX x1w, y1w, c1w
    ldmia   rca!, {c0w, c1w}
    ARX_BOX x2w, y2w, c0w
    ARX_BOX x3w, y3w, c1w
    sub     rca, rca, #16
.endm
// LINEAR_LAYER_256: Feistel-style linear mixing of the eight state words
// followed by the branch permutation. With
//   ell(v) = ROR(v XOR (v LSL 16), 16)
// and all right-hand sides taken before the macro runs, the result is
//   y0 = y3 ^ y1 ^ ell(x0 ^ x1)      x0 = x3 ^ x1 ^ ell(y0 ^ y1)
//   y1 = y2 ^ y0 ^ ell(x0 ^ x1)      x1 = x2 ^ x0 ^ ell(y0 ^ y1)
//   y2 = y0                          x2 = x0
//   y3 = y1                          x3 = x1
// Clobbers tmpx and tmpy.
.macro LINEAR_LAYER_256
    // First part of Feistel round: tmpx and tmpy are computed and XORed to
    // the y-words and x-words of the right-side branches (i.e. to y[2], y[3]
    // and to x[2], x[3]). Note that y[3] and x[3] are stored in register
    // tmpx and tmpy (and not in register y3w and x3w) to reduce the
    // execution time of the subsequent branch permutation.
    eor     tmpx, x0w, x1w
    eor     tmpx, tmpx, tmpx, lsl #16
    eor     y2w, y2w, tmpx, ror #16
    eor     tmpx, y3w, tmpx, ror #16
    eor     tmpy, y0w, y1w
    eor     tmpy, tmpy, tmpy, lsl #16
    eor     x2w, x2w, tmpy, ror #16
    eor     tmpy, x3w, tmpy, ror #16
    // Branch permutation: 1-branch left-rotation of the right-side branches
    // along with a swap of the left and right branches (via register
    // writes). Also combined with the branch permutation is the second
    // Feistel part, in which the left-side branches are XORed with the
    // result of the first Feistel part.
    mov     y3w, y1w
    eor     y1w, y2w, y0w
    mov     y2w, y0w
    eor     y0w, tmpx, y3w
    mov     x3w, x1w
    eor     x1w, x2w, x0w
    mov     x2w, x0w
    eor     x0w, tmpy, x3w
.endm
//Function prototype:
// -------------------
// void sparkle256/384/512_arm(uint32_t *state, int ns)
// Parameters:
// -----------
// state: pointer to an uint32-array containing 8/12/16 state words
// ns: number of steps
// Return value:
// -------------
// None
//
// sparkle256_arm works on 8 state words, which are loaded into registers by
// the prologue, transformed in registers, and written back to the same array
// by the epilogue.
// Precondition: ns >= 1. The loop body runs before the counter is tested and
// the loop exits only when the counter equals ns, so ns = 0 would not stop
// after zero steps.
    .align  1
    .p2align 2,,3
    .fpu    softvfp
    .thumb                      // the selected CPU (cortex-m3) is Thumb-only
    .global sparkle256_arm
    .thumb_func                 // mark the entry symbol as Thumb code
    .type   sparkle256_arm, %function
sparkle256_arm:
    PROLOGUE_256                // push callee-saved registers
    mov     scnt, #0            // clear step-counter
.L1:                            // start of the step loop
    ADD_STEP_CNT_256            // macro to add step-counter to state
    ARXBOX_LAYER_256            // macro for the arxbox layer
    LINEAR_LAYER_256            // macro for the linear layer
    add     scnt, #1            // increment step-counter
    teq     scnt, ns            // test whether step-counter equals ns
    bne     .L1                 // if not then jump back to start of loop
    EPILOGUE_256                // pop callee-saved registers
    .size   sparkle256_arm, .-sparkle256_arm