; knot384.h
;
; **********************************************
; * KNOT: a family of bit-slice lightweight    *
; *       authenticated encryption algorithms  *
; *       and hash functions                   *
; *                                            *
; * Assembly implementation for 8-bit AVR CPU  *
; * Version 1.1 2020 by KNOT Team              *
; **********************************************
;

; Register map for the two state rows that Permutation keeps in registers.
; The assignment is intentional: each even/odd hardware register pair holds
; byte k (even register) and byte k + 6 mod 12 (odd register) of a row, so
; the byte rotations of ShiftRows can move two bytes with a single movw.

; Row 2, bytes x20..x2b, in r0..r11
; pair r1:r0 = x26:x20
#define x20  r0
#define x26  r1
; pair r3:r2 = x27:x21
#define x21  r2
#define x27  r3
; pair r5:r4 = x28:x22
#define x22  r4
#define x28  r5
; pair r7:r6 = x29:x23
#define x23  r6
#define x29  r7
; pair r9:r8 = x2a:x24
#define x24  r8
#define x2a  r9
; pair r11:r10 = x2b:x25
#define x25  r10
#define x2b  r11

; Row 3, bytes x30..x3b, in r12..r23
; pair r13:r12 = x37:x31
#define x31  r12
#define x37  r13
; pair r15:r14 = x32:x38
#define x38  r14
#define x32  r15
; pair r17:r16 = x39:x33
#define x33  r16
#define x39  r17
; pair r19:r18 = x34:x3a
#define x3a  r18
#define x34  r19
; pair r21:r20 = x3b:x35
#define x35  r20
#define x3b  r21
; pair r23:r22 = x36:x30
#define x30  r22
#define x36  r23

; Scratch registers.
; t1j:t0j is the pair r25:r24 and is used as a movw destination.
; x0j shares r25 with t1j: x0j is used in the column steps, t1j in the
; byte rotations that follow them.
; NOTE(review): x1j is r27, the high byte of the X pointer -- confirm that
; callers keeping X live are covered by PUSH_CONFLICT in assist.h.
#define t0j  r24
#define t1j  r25
#define x0j  r25
#define x1j  r27

#include "assist.h"

; Sbox i0, i1, i2, i3
;   Bit-sliced S-box: only bitwise operations are used, so the same 4-input
;   boolean function is applied to every bit position of the four operand
;   registers at once. All four operands are overwritten with the outputs.
;   Below, a, b, c, d denote the values of i0, i1, i2, i3 on entry.
;   Clobbers: t0j (must be r16..r31 because of ldi) and the S, V, N, Z flags.
;   Invariant: the carry flag is left untouched (ldi, mov, and, or, eor do
;   not write C). OneColumn chains rol through C across S-box invocations,
;   so do not replace the ldi/eor pair with com, which sets C.
.macro Sbox i0, i1, i2, i3
    ldi  t0j,  0xFF              ; t0j = all ones
    eor  \i0,  t0j               ; i0 = ~a
    mov  t0j,  \i1               ; t0j = b (saved before i1 is overwritten)
    and  \i1,   \i0              ; i1 = b & ~a
    eor  \i1,   \i2              ; i1 = (b & ~a) ^ c
    or   \i2,   t0j              ; i2 = c | b
    eor  \i0,   \i3              ; i0 = ~a ^ d
    eor  \i2,   \i0              ; i2 = (c | b) ^ ~a ^ d          -- final i2
    eor  t0j,   \i3              ; t0j = b ^ d
    and  \i0,   \i1              ; i0 = (~a ^ d) & i1
    eor  \i3,   \i1              ; i3 = d ^ i1                    -- final i3
    eor  \i0,   t0j              ; i0 = i0 ^ b ^ d                -- final i0
    and  t0j,  \i2               ; t0j = (b ^ d) & i2
    eor  \i1,   t0j              ; i1 = i1 ^ t0j                  -- final i1
.endm

; OneColumn i0, i1, i2, i3
;   Processes one byte column of the state and advances Y by one.
;   i0, i1 are scratch registers that receive the row 0 and row 1 bytes
;   from SRAM (row 0 byte at Y, row 1 byte at Y + ROW_INBYTES);
;   i2, i3 are the registers holding the row 2 and row 3 bytes of the
;   same column and are updated in place by Sbox.
;   The row 1 byte is rotated through the carry flag before it is stored:
;   C on entry must be the bit shifted out of the previous column, and C on
;   exit is the bit shifted out of this one. Nothing between two rol steps
;   (ld, ldd, st, std, Sbox) writes C.
.macro OneColumn i0, i1, i2, i3
    ld   \i0, Y                  ; row 0 byte of this column
    ldd  \i1, Y + ROW_INBYTES    ; row 1 byte of this column
    Sbox \i0, \i1, \i2, \i3
    st   Y+, \i0                 ; store row 0 byte, step Y to the next column
    rol  \i1                     ; ShiftRows -- Row 1 <<< 1: C from previous column in, bit 7 out to C
    std  Y + ROW_INBYTES -1, \i1 ; Y has already advanced, hence the -1
.endm

; ----------------------------------------------------------------------
; Permutation
;   Applies rn rounds to the state stored at SRAM_STATE. The state is four
;   rows of ROW_INBYTES bytes laid out back to back (row 0 first).
;   Rows 0 and 1 are processed in SRAM through Y; rows 2 and 3 are loaded
;   into x20..x2b and x30..x3b once, kept in registers for all rounds, and
;   written back at the end.
;
;   Each round: XOR one byte of RC_LFSR7 into row 0 byte 0, run the S-box
;   over every byte column, rotate row 1 left by 1 bit, row 2 left by
;   8 bits and row 3 left by 55 bits (right by 1 bit, then left by 7 bytes).
;
; In:     rn = number of rounds. One table byte is consumed per round and
;         the counter is tested after the decrement, so rn = 0 does not
;         mean zero rounds. NOTE(review): callers presumably pass 1..128
;         (RC_LFSR7 holds 128 bytes) -- confirm.
; Out:    state at SRAM_STATE updated in place.
; Uses:   r0-r25, r27 (x1j), Y, Z, rcnt, SREG flags.
;         PUSH_CONFLICT / POP_CONFLICT come from assist.h;
;         NOTE(review): presumably they save and restore the registers
;         that must survive the call -- confirm there.
; Layout: written for ROW_INBYTES = 12: twelve column steps are unrolled
;         and Y is expected to sit on row 1 byte 0 after them.
; ----------------------------------------------------------------------
Permutation:
    PUSH_CONFLICT
    mov  rcnt, rn                ; rcnt = rounds still to run

    ; Load rows 2 and 3 (24 consecutive bytes) into their registers.
    ldi  YH, hi8(SRAM_STATE + 2 * ROW_INBYTES)
    ldi  YL, lo8(SRAM_STATE + 2 * ROW_INBYTES)
    ld   x20, Y+
    ld   x21, Y+
    ld   x22, Y+
    ld   x23, Y+
    ld   x24, Y+
    ld   x25, Y+
    ld   x26, Y+
    ld   x27, Y+
    ld   x28, Y+
    ld   x29, Y+
    ld   x2a, Y+
    ld   x2b, Y+
    ld   x30, Y+
    ld   x31, Y+
    ld   x32, Y+
    ld   x33, Y+
    ld   x34, Y+
    ld   x35, Y+
    ld   x36, Y+
    ld   x37, Y+
    ld   x38, Y+
    ld   x39, Y+
    ld   x3a, Y+
    ld   x3b, Y+

    ; Z walks the round-constant table in program memory, one byte per round.
    ldi  ZL, lo8(RC_LFSR7)
    ldi  ZH, hi8(RC_LFSR7)

round_loop_start:
    ; AddRC: XOR the constant for this round into row 0 byte 0.
    lpm  t0j, Z+
    ldi  YH, hi8(SRAM_STATE)
    ldi  YL, lo8(SRAM_STATE)
    ld   x0j, Y
    eor  x0j, t0j

    ; Column 0. Same steps as OneColumn, except that row 0 is already
    ; loaded and the row 1 rotation starts here with lsl instead of rol.
    ldd  x1j, Y + ROW_INBYTES
    Sbox x0j, x1j, x20, x30
    st   Y+, x0j
    lsl  x1j                     ; ShiftRows -- Row 1 <<< 1: bit 0 = 0 for now, C = old bit 7
    std  Y + ROW_INBYTES -1, x1j

    ; Columns 1..11. The carry flag links the rol of each column to the next.
    OneColumn x0j, x1j, x21, x31
    OneColumn x0j, x1j, x22, x32
    OneColumn x0j, x1j, x23, x33
    OneColumn x0j, x1j, x24, x34
    OneColumn x0j, x1j, x25, x35
    OneColumn x0j, x1j, x26, x36
    OneColumn x0j, x1j, x27, x37
    OneColumn x0j, x1j, x28, x38
    OneColumn x0j, x1j, x29, x39
    OneColumn x0j, x1j, x2a, x3a
    OneColumn x0j, x1j, x2b, x3b

    ; Close the row 1 rotation: the bit shifted out of the last byte wraps
    ; around into bit 0 of row 1 byte 0. After twelve post-increments Y
    ; points at that byte.
    ld   x1j, Y
    eor  t0j, t0j                ; t0j = 0; eor does not write C
    adc  x1j, t0j                ; bit 0 was cleared by lsl, so adding C sets it to C
    st   Y, x1j

    ; ShiftRows -- Row 2 <<< 8 (rotate left by one byte)
    ; b a 9 8 7 6 5 4 3 2 1 0 => a 9 8 7 6 5 4 3 2 1 0 b
    movw t0j, x25                ; t1j:t0j <= x2b:x25
    movw x25, x24                ; x2b:x25 <= x2a:x24
    movw x24, x23                ; x2a:x24 <= x29:x23
    movw x23, x22                ; x29:x23 <= x28:x22
    movw x22, x21                ; x28:x22 <= x27:x21
    movw x21, x20                ; x27:x21 <= x26:x20
    mov  x26, t0j                ; x26 <= x25
    mov  x20, t1j                ; x20 <= x2b

    ; ShiftRows -- Row 3 <<< 55, done as >>> 1 followed by <<< 56.
    ; >>> 1: the ror on a copy of x3b only serves to put bit 0 of x3b into
    ; C for x3a; x3b itself is rotated last so that it receives bit 0 of x30.
    mov  t0j, x3b
    ror  t0j
    ror  x3a
    ror  x39
    ror  x38
    ror  x37
    ror  x36
    ror  x35
    ror  x34
    ror  x33
    ror  x32
    ror  x31
    ror  x30
    ror  x3b

    ; <<< 56 (rotate left by seven bytes)
    ; b a 9 8 7 6 5 4 3 2 1 0 => 4 3 2 1 0 b a 9 8 7 6 5
    movw t0j, x30                ; t1j:t0j <= x36:x30
    movw x30, x35                ; x36:x30 <= x3b:x35
    movw x35, x3a                ; x3b:x35 <= x34:x3a
    movw x3a, x33                ; x34:x3a <= x39:x33
    movw x33, x38                ; x39:x33 <= x32:x38
    movw x38, x31                ; x32:x38 <= x37:x31
    mov  x31, t1j                ; x31 <= x36
    mov  x37, t0j                ; x37 <= x30

    ; The loop body is far too long for a conditional branch, so branch
    ; out and jump back. rjmp is used instead of jmp: the distance is a
    ; few hundred words, and jmp does not exist on the smaller AVR devices.
    dec  rcnt
    breq round_loop_end
    rjmp round_loop_start

round_loop_end:

    ; Write rows 2 and 3 back to the state.
    ldi  YH, hi8(SRAM_STATE + 2 * ROW_INBYTES)
    ldi  YL, lo8(SRAM_STATE + 2 * ROW_INBYTES)
    st   Y+, x20
    st   Y+, x21
    st   Y+, x22
    st   Y+, x23
    st   Y+, x24
    st   Y+, x25
    st   Y+, x26
    st   Y+, x27
    st   Y+, x28
    st   Y+, x29
    st   Y+, x2a
    st   Y+, x2b
    st   Y+, x30
    st   Y+, x31
    st   Y+, x32
    st   Y+, x33
    st   Y+, x34
    st   Y+, x35
    st   Y+, x36
    st   Y+, x37
    st   Y+, x38
    st   Y+, x39
    st   Y+, x3a
    st   Y+, x3b

    POP_CONFLICT
    ret

; RC_LFSR7: round-constant table in program memory (128 bytes).
; Permutation points Z at the first entry on every call and reads one
; byte per round with lpm.
; Entries 1..127 are the successive states of a 7-bit LFSR started at 0x01:
;     next = ((s << 1) | (bit6(s) ^ bit5(s))) & 0x7F
; The 128th entry (0x00) is not part of that sequence (the state after
; 0x40 would be 0x01 again).
; NOTE(review): presumably padding to an even 128 bytes -- confirm that no
; caller asks for 128 rounds.
RC_LFSR7:
.byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x41, 0x03
.byte 0x06, 0x0c, 0x18, 0x30, 0x61, 0x42, 0x05, 0x0a
.byte 0x14, 0x28, 0x51, 0x23, 0x47, 0x0f, 0x1e, 0x3c
.byte 0x79, 0x72, 0x64, 0x48, 0x11, 0x22, 0x45, 0x0b
.byte 0x16, 0x2c, 0x59, 0x33, 0x67, 0x4e, 0x1d, 0x3a
.byte 0x75, 0x6a, 0x54, 0x29, 0x53, 0x27, 0x4f, 0x1f
.byte 0x3e, 0x7d, 0x7a, 0x74, 0x68, 0x50, 0x21, 0x43
.byte 0x07, 0x0e, 0x1c, 0x38, 0x71, 0x62, 0x44, 0x09
.byte 0x12, 0x24, 0x49, 0x13, 0x26, 0x4d, 0x1b, 0x36
.byte 0x6d, 0x5a, 0x35, 0x6b, 0x56, 0x2d, 0x5b, 0x37
.byte 0x6f, 0x5e, 0x3d, 0x7b, 0x76, 0x6c, 0x58, 0x31
.byte 0x63, 0x46, 0x0d, 0x1a, 0x34, 0x69, 0x52, 0x25
.byte 0x4b, 0x17, 0x2e, 0x5d, 0x3b, 0x77, 0x6e, 0x5c
.byte 0x39, 0x73, 0x66, 0x4c, 0x19, 0x32, 0x65, 0x4a
.byte 0x15, 0x2a, 0x55, 0x2b, 0x57, 0x2f, 0x5f, 0x3f
.byte 0x7f, 0x7e, 0x7c, 0x78, 0x70, 0x60, 0x40, 0x00