//  Web listing header (not part of the program): sneik_f512_armv7_fast.S,
//  2.44 KB, committed by lwc-tester.
//  sneik_f512_armv7_fast.S
//  2019-02-06  Markku-Juhani O. Saarinen <mjos@pqshield.com>
//  Copyright (C) 2019, PQShield Ltd. Please see LICENSE.

//  SNEIK f512 implementation for ARMv7 (including ARMv7-M). The 16 mixing
//  steps of each round are fully unrolled; the rounds themselves are a loop.

//  C prototype: void sneik_f512(void *state, uint8_t dom, uint8_t rounds);

        .syntax unified                     // unified ARM/Thumb-2 syntax
        .text                               // code goes into the text section
        .type   sneik_f512, %function       // symbol is a function ...
        .global sneik_f512                  // ... and is visible to the linker

//      mix_f pos, t0, t1, t2, t3
//
//      One mixing step on state word vec[pos], using 4 rotating registers.
//
//      pos     constant word index of the state word being updated
//      t0      register holding vec[pos]; receives the new value, which
//              is also stored back to vec[pos]
//      t1      register XORed into the result at the very end
//      t2      register XORed in after the first rotate/XOR stage; it is
//              then RELOADED with vec[(pos + 2) % 16] and added
//      t3      register added to t0 at the start
//
//      Requires r0 = address of vec (the 16-word state).
//      Clobbers r3 (temporary), so r3 must not be passed as t0..t3.
//      No instruction here uses a flag-setting (s-suffixed) form.
//
//      \pos is parenthesised wherever it is used, so that an expression
//      argument yields the same index for the load and the store.

.macro  mix_f   pos, t0, t1, t2, t3
        add     \t0, \t0, \t3               // t0 += t3;
        eor     r3,  \t0, \t0, ror #8       // r3  = t0 ^ ROR32(t0, 8);
        eor     \t0, r3,  \t0, ror #7       // t0  = r3 ^ ROR32(t0, 7);
        eor     \t0, \t0, \t2               // t0 ^= t2;
        ldr     \t2, [r0, #4*(((\pos) + 2) % 16)] // t2 = vec[(pos + 2) & 0xF];
        add     \t0, \t0, \t2               // t0 += t2;
        eor     r3,  \t0, \t0, ror #8       // r3  = t0 ^ ROR32(t0, 8);
        eor     \t0, \t0, r3,  ror #15      // t0 ^= ROR32(r3, 15), which is
                                            // t0 ^ ROR32(t0,15) ^ ROR32(t0,23)
        eor     \t0, \t0, \t1               // t0 ^= t1;
        str     \t0, [r0, #4*(\pos)]        // vec[pos] = t0;
        .endm

//      void sneik_f512(void *state, uint8_t dom, uint8_t rounds);
//
//      Applies `rounds` rounds of the permutation to the state, in place.
//
//      In:     r0 = state: sixteen 32-bit words, read and written
//              r1 = dom:   XORed into the register holding word 1 at the
//                          start of every round
//              r2 = rounds: number of rounds to execute
//      Out:    nothing in registers; the state is overwritten
//      Uses:   r3 (round constant, mix_f temporary), ip (round constant
//              pointer), condition flags. r4-r7 are saved on the stack
//              (16 bytes) and restored before returning.
//
//      r1 and r2 are used as full 32-bit registers.
//      NOTE(review): this assumes the caller has zero-extended the two
//      uint8_t arguments, as the 32-bit ARM procedure call standard
//      requires -- confirm for the toolchain in use.
//
//      Round i consumes byte i of the .rc table, and the pointer is never
//      range-checked, so rounds must not exceed the 16 bytes of that table.
//
//      rounds == 0 returns at once and leaves the state untouched. Without
//      that check the bottom-tested loop would run one round, wrap the
//      counter to 0xFFFFFFFF and keep going, reading far past the table.

sneik_f512:

        cmp     r2, #0                      // rounds == 0: nothing to do
        beq     .Lsneik_ret                 // (nothing pushed yet)

        push    {r4-r7}
        ldr     ip, .Lrcptr                 // ip = &rc[0], round constants
        ldr     r4, [r0, #4*0]              // r4 = vec[0]
        ldr     r5, [r0, #4*1]              // r5 = vec[1]
        ldr     r6, [r0, #4*14]             // r6 = vec[14]
        ldr     r7, [r0, #4*15]             // r7 = vec[15]

//      Invariant at the top of every round:
//      r4 = vec[0], r5 = vec[1], r6 = vec[14], r7 = vec[15].
//      The 16 steps rotate the register roles (each step reloads its t2
//      with vec[pos + 2]), which re-establishes this at the bottom.

.Lround:
        ldrb    r3, [ip], #1                // r3 = *rc++ (round constant)
        eor     r4, r4, r3                  // vec[0] ^= round constant
        eor     r5, r5, r1                  // vec[1] ^= domain
        mix_f   0,  r4, r5, r6, r7
        mix_f   1,  r5, r6, r7, r4
        mix_f   2,  r6, r7, r4, r5
        mix_f   3,  r7, r4, r5, r6
        mix_f   4,  r4, r5, r6, r7
        mix_f   5,  r5, r6, r7, r4
        mix_f   6,  r6, r7, r4, r5
        mix_f   7,  r7, r4, r5, r6
        mix_f   8,  r4, r5, r6, r7
        mix_f   9,  r5, r6, r7, r4
        mix_f   10, r6, r7, r4, r5
        mix_f   11, r7, r4, r5, r6
        mix_f   12, r4, r5, r6, r7
        mix_f   13, r5, r6, r7, r4
        mix_f   14, r6, r7, r4, r5
        mix_f   15, r7, r4, r5, r6

        subs    r2, r2, #1                  // one round done
        bne     .Lround                     // loop while rounds remain

        pop     {r4-r7}
.Lsneik_ret:
        bx      lr

//      Literal: absolute address of the round constant table, loaded
//      PC-relative above; kept word-aligned for that load.

        .align  2
.Lrcptr:
        .word   .rc

        .size       sneik_f512, .-sneik_f512
        .section    .rodata

//      Round constant table, 16 bytes. sneik_f512 reads it one byte per
//      round, starting at the first entry. Entries 8..15 are the bitwise
//      complements of entries 0..7.

.rc:
        .byte   0xEF, 0xE0, 0xD9, 0xD6      // entries  0..3
        .byte   0xBA, 0xB5, 0x8C, 0x83      // entries  4..7
        .byte   0x10, 0x1F, 0x26, 0x29      // entries  8..11
        .byte   0x45, 0x4A, 0x73, 0x7C      // entries 12..15