sneik_f512_avr_small.S 6.06 KB
Newer Older
lwc-tester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
//  sneik_f512_avr_small.S
//  2019-02-14  Markku-Juhani O. Saarinen <mjos@pqshield.com>
//  Copyright (C) 2019, PQShield Ltd. Please see LICENSE.

//  Smaller 8-bit Atmel AVR implementation for the SNEIK f512 function.

//  Prototype:
//      void sneik_f512(void *s, uint8_t dom, uint8_t rounds);

__zero_reg__ = 1

        .text

// load 4 bytes to 4 registers via y

.macro  ldy4    rr
        ld      \rr,        y+
        ld      \rr + 1,    y+
        ld      \rr + 2,    y+
        ld      \rr + 3,    y+
        .endm

// load 4 bytes to 4 registers via z

.macro  ldzv4   rr, rv
        ldd     \rr,        z + (4 * \rv)
        ldd     \rr + 1,    z + (4 * \rv + 1)
        ldd     \rr + 2,    z + (4 * \rv + 2)
        ldd     \rr + 3,    z + (4 * \rv + 3)
        .endm

// mixing function. macro loads v1

.macro  mix_f   t0, t1, t2, t3, tmp

        //      inner feedback left
        add     \t0,        \t3         //  t0 += t3;
        adc     \t0 + 1,    \t3 + 1
        adc     \t0 + 2,    \t3 + 2
        adc     \t0 + 3,    \t3 + 3

        //      p = x^25 + x^24 + 1
        mov     \tmp + 3,   \t0         //  tmp = t0 <<< 24
        mov     \tmp,       \t0 + 1
        mov     \tmp + 1,   \t0 + 2
        mov     \tmp + 2,   \t0 + 3
        eor     \t0,        \tmp        //  t0 = t0 ^ (t0 <<< 24)
        eor     \t0 + 1,    \tmp + 1
        eor     \t0 + 2,    \tmp + 2
        eor     \t0 + 3,    \tmp + 3
        lsl     \tmp                    //  tmp = t0 <<< 25
        rol     \tmp + 1
        rol     \tmp + 2
        rol     \tmp + 3
        adc     \tmp,   __zero_reg__
        eor     \t0,        \tmp        //  t0 ^ (t0 <<< 24) ^ (t0 <<< 25)
        eor     \t0 + 1,    \tmp + 1
        eor     \t0 + 2,    \tmp + 2
        eor     \t0 + 3,    \tmp + 3

        //      outer feedback
        eor     \t0,        \t2         //  t0 ^= t2;
        eor     \t0 + 1,    \t2 + 1
        eor     \t0 + 2,    \t2 + 2
        eor     \t0 + 3,    \t2 + 3

        ldy4    \t2                     //  vec[(pos + 2) & 0xF];

        add     \t0,        \t2         //  t0 += t2;
        adc     \t0 + 1,    \t2 + 1
        adc     \t0 + 2,    \t2 + 2
        adc     \t0 + 3,    \t2 + 3

        //      p = x^17 + x^9 + 1
        mov     \tmp + 1,   \t0         //  tmp = t0 <<< 8
        mov     \tmp + 2,   \t0 + 1
        mov     \tmp + 3,   \t0 + 2
        mov     \tmp,       \t0 + 3
        eor     \tmp + 2,   \t0         //  tmp = (t0 <<< 8) ^ (t0 <<< 16)
        eor     \tmp + 3,   \t0 + 1
        eor     \tmp,       \t0 + 2
        eor     \tmp + 1,   \t0 + 3
        lsl     \tmp                    //  tmp = (t0 <<< 9) ^ (t0 <<< 17)
        rol     \tmp + 1
        rol     \tmp + 2
        rol     \tmp + 3
        adc     \tmp,   __zero_reg__
        eor     \t0,        \tmp        //  t0 ^ (t0 <<< 9) ^ (t0 <<< 17)
        eor     \t0 + 1,    \tmp + 1
        eor     \t0 + 2,    \tmp + 2
        eor     \t0 + 3,    \tmp + 3

        //      inner feedback right
        eor     \t0,        \t1         //  t0 ^= t1;
        eor     \t0 + 1,    \t1 + 1
        eor     \t0 + 2,    \t1 + 2
        eor     \t0 + 3,    \t1 + 3

        //      store v[0]
        st      z+, \t0
        st      z+, \t0 + 1
        st      z+, \t0 + 2
        st      z+, \t0 + 3
        .endm

        //  aliases for 32-bit register sets
        W0      = 4                     //  "W0" is ( r4,  r5,  r6,  r7  )
        W1      = 8                     //  "W1" is ( r8,  r9,  r10, r11 )
        W2      = 12                    //  "W2" is ( r12, r13, r14, r15 )
        W3      = 16                    //  "W3" is ( r16, r17, r18, r19 )
        WT      = 20                    //  "WT" is ( r20, r21, r22, r23 )

.mixf4: mix_f   W0, W1, W2, W3, WT
        mix_f   W1, W2, W3, W0, WT
        sub     yl, r25                 //  Optionally wrap y back to start
        sbci    yh, 0
        mix_f   W2, W3, W0, W1, WT
        mix_f   W3, W0, W1, W2, WT
        ret

//  The C call convention with AVR is that:
//  R2  - R17, R28, R29 are call-saved
//  R18 - R27, R30, R31 are call-globbered

//  Prototype:
//  void sneik_f512(void *s, uint8_t r);

.global sneik_f512
        .type   sneik_f512, @function

sneik_f512:
        push    r3
        push    r4
        push    r5
        push    r6
        push    r7
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        push    r16
        push    r17

        push    r28
        push    r29

        movw    z,      r24             //  state pointer   (arg 0 = r25:r24)
        movw    y,      r24
        mov     r24,    r22             //  domain          (arg 1 = 23:22)
        mov     r3,     r20             //  round count     (arg 2 = r21:r20

        ldi     xl,     lo8(rc)         //  round constant table
        ldi     xh,     hi8(rc)

        ldzv4   W2, 14                  //  W2 = v[14]
        ldzv4   W3, 15                  //  W3 = v[15]

        ldy4    W0
        ldy4    W1

.round:
        ld      WT, x+                  //  round constant
        eor     W0, WT
        eor     W1, r24                 //  domain

        ldi     r25, 0                  //  no wrapup on y
        rcall   .mixf4
        rcall   .mixf4
        rcall   .mixf4
        ldi     r25, 64                 //  wrap y back up
        rcall   .mixf4

        subi    zl, 64                  //  wrap z back up
        sbci    zh, 0

        dec     r3                      //  round count (arg 2 = r21:r20)
        brne    .round

        pop     r29
        pop     r28

        pop     r17
        pop     r16
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     r7
        pop     r6
        pop     r5
        pop     r4
        pop     r3

        ret

        // round constants
        .section    .rodata
rc:     .byte   0xEF, 0xE0, 0xD9, 0xD6, 0xBA, 0xB5, 0x8C, 0x83
        .byte   0x10, 0x1F, 0x26, 0x29, 0x45, 0x4A, 0x73, 0x7C

.global __do_copy_data
.global __do_clear_bss