@ =======================================================================
@ Combined Saturnin-CTR-Cascade and Saturnin-Hash implementation for
@ ARM Cortex-M3 CPU. This implements the API described in saturnin.h.
@ =======================================================================

        .syntax unified
        .cpu cortex-m3
        .file "saturnin_m3.s"
        .text

@ =======================================================================
@ void saturnin_key_expand(uint32_t *keybuf, const uint8_t *key)
@
@ Read 32-byte key (possibly unaligned) and convert it to the internal
@ bitslice-32 representation (key bytes yield sixteen 16-bit registers
@ z0 to z15; internal representation is eight 32-bit words k0..k7
@ such that k_i = z_i | (z_{i+8} << 16)).
@ The words k_i are written in keybuf[]. Then they are written again,
@ this time with the extra "key rotation". In total, sixteen 32-bit
@ words are written in keybuf[].
@ =======================================================================

        .align 1
        .global saturnin_key_expand
        .thumb
        .thumb_func
        .type saturnin_key_expand, %function
saturnin_key_expand:
        push {r4, r5, r6, r7, r8, lr}

        @ Set r12 = 0x001F001F and r8 = 0x07FF07FF
        movs r2, #0x1F
        orr r12, r2, r2, lsl #16
        mvn r8, r12, lsl #11

        @ Make words q0...q3
        ldr r2, [r1]
        ldr r3, [r1, #16]
        movs r4, r2
        bfi r4, r3, #16, #16
        bfc r3, #0, #16
        orr r5, r3, r2, lsr #16
        ldr r2, [r1, #4]
        ldr r3, [r1, #20]
        movs r6, r2
        bfi r6, r3, #16, #16
        bfc r3, #0, #16
        orr r7, r3, r2, lsr #16
        stm r0!, {r4, r5, r6, r7}

        @ Apply rotations on q0..q3
        and r2, r8, r4, lsr #5
        and r3, r12, r4
        orr r4, r2, r3, lsl #11
        and r2, r8, r5, lsr #5
        and r3, r12, r5
        orr r5, r2, r3, lsl #11
        and r2, r8, r6, lsr #5
        and r3, r12, r6
        orr r6, r2, r3, lsl #11
        and r2, r8, r7, lsr #5
        and r3, r12, r7
        orr r7, r2, r3, lsl #11
        adds r0, #16
        stm r0!, {r4, r5, r6, r7}

        @ Make words q4...q7
        ldr r2, [r1, #8]
        ldr r3, [r1, #24]
        movs r4, r2
        bfi r4, r3, #16, #16
        bfc r3, #0, #16
        orr r5, r3, r2, lsr #16
        ldr r2, [r1, #12]
        ldr r3, [r1, #28]
        movs r6, r2
        bfi r6, r3, #16, #16
        bfc r3, #0, #16
        orr r7, r3, r2, lsr #16
        subs r0, #32
        stm r0!, {r4, r5, r6, r7}

        @ Apply rotations on q4..q7
        and r2, r8, r4, lsr #5
        and r3, r12, r4
        orr r4, r2, r3, lsl #11
        and r2, r8, r5, lsr #5
        and r3, r12, r5
        orr r5, r2, r3, lsl #11
        and r2, r8, r6, lsr #5
        and r3, r12, r6
        orr r6, r2, r3, lsl #11
        and r2, r8, r7, lsr #5
        and r3, r12, r7
        orr r7, r2, r3, lsl #11
        add r0, #16
        stm r0!, {r4, r5, r6, r7}

        pop {r4, r5, r6, r7, r8, pc}
        .size saturnin_key_expand, .-saturnin_key_expand
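
@ -----------------------------------------------------------------------
@ For reference, a C sketch of the conversion performed above. The z[],
@ k[] and kr[] names are illustrative only (not part of the saturnin.h
@ API); z[i] is the 16-bit value decoded in little-endian convention
@ from key bytes 2*i and 2*i+1.
@
@     uint32_t k[8], kr[8];
@     for (int i = 0; i < 8; i ++) {
@         k[i] = (uint32_t)z[i] | ((uint32_t)z[i + 8] << 16);
@         keybuf[i] = k[i];
@     }
@     /* "Key rotation": each 16-bit half is rotated left by 11 bits. */
@     for (int i = 0; i < 8; i ++) {
@         kr[i] = ((k[i] & 0x001F001F) << 11)
@               | ((k[i] >> 5) & 0x07FF07FF);
@         keybuf[i + 8] = kr[i];
@     }
@ -----------------------------------------------------------------------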

@ =======================================================================
@ void saturnin_block_encrypt(int R, const uint32_t *rc,
@                             uint32_t *keybuf, uint8_t *buf)
@
@ Perform one block encryption:
@   R        Number of super-rounds (typically 10 or 16); must be even.
@   rc       The round constants, in usage order.
@   keybuf   The key and the rotated key.
@   buf      The input/output block (possibly unaligned).
@
@ The key and rotated key are in internal representation, as output
@ by saturnin_key_expand(). Each round constant is a 32-bit word;
@ low 16 bits are RC0, high 16 bits are RC1.
@ =======================================================================

        .align 1
        .global saturnin_block_encrypt
        .thumb
        .thumb_func
        .type saturnin_block_encrypt, %function
saturnin_block_encrypt:
        push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, lr}

        @ Conventions:
        @   r0..r7    state values q0..q7
        @   [sp]      super-round counter
        @   [sp+4]    pointer to next round constant
        @   [sp+8]    pointer to key buf
        @   [sp+12]   pointer to data block
        @ Reserved registers are r9, r13 (sp) and r15 (pc).
        @ Scratch registers are r8, r10, r11, r12 and r14.

        @ Read block into q0..q7
        ldr r0, [r3, #8]
        ldr r1, [r3, #24]
        movs r4, r0
        bfi r4, r1, #16, #16
        bfc r1, #0, #16
        orr r5, r1, r0, lsr #16
        ldr r0, [r3, #12]
        ldr r1, [r3, #28]
        movs r6, r0
        bfi r6, r1, #16, #16
        bfc r1, #0, #16
        orr r7, r1, r0, lsr #16
        ldr r0, [r3, #4]
        ldr r1, [r3, #20]
        movs r2, r0
        bfi r2, r1, #16, #16
        bfc r1, #0, #16
        orr r12, r1, r0, lsr #16
        ldr r0, [r3, #0]
        ldr r1, [r3, #16]
        movs r14, r0
        bfi r14, r1, #16, #16
        bfc r1, #0, #16
        orr r1, r1, r0, lsr #16
        movs r3, r12
        movs r0, r14

        @ XOR with key
        ldr r8, [sp, #8]
        ldm r8!, {r10, r11, r12, r14}
        eors r0, r10
        eors r1, r11
        eors r2, r12
        eors r3, r14
        ldm r8!, {r10, r11, r12, r14}
        eors r4, r10
        eors r5, r11
        eors r6, r12
        eors r7, r14

.Lsaturnin_block_encrypt_loop:

        @ ============= Even round

        @ Apply Sbox
        ands r8, r1, r2
        eors r8, r0, r8      @ r8 r1 r2 r3
        orrs r10, r8, r3
        eors r10, r1, r10    @ r8 r10 r2 r3
        orrs r11, r10, r2
        eors r11, r3, r11    @ r8 r10 r2 r11
        ands r1, r10, r11
        eors r1, r2, r1      @ r8 r10 r1 r11
        orrs r0, r8, r1
        eors r0, r10, r0     @ r8 r0 r1 r11
        orrs r3, r0, r11
        eors r3, r8, r3      @ r3 r0 r1 r11
        movs r2, r11         @ r3 r0 r1 r2

        ands r8, r5, r6
        eors r8, r4, r8      @ r8 r5 r6 r7
        orrs r10, r8, r7
        eors r5, r5, r10     @ r8 r5 r6 r7
        orrs r4, r5, r6
        eors r4, r7, r4      @ r8 r5 r6 r4
        ands r7, r5, r4
        eors r7, r6, r7      @ r8 r5 r7 r4
        orrs r11, r8, r7
        eors r5, r5, r11     @ r8 r5 r7 r4
        orrs r6, r5, r4
        eors r6, r8, r6      @ r6 r5 r7 r4

        @ Apply MDS
        @ Initial: state is: r0 r1 r2 r3 r4 r5 r6 r7
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r0, r4
        eors r1, r5
        eors r2, r6
        eors r3, r7
        @ MUL(q4, q5, q6, q7);
        eors r4, r5          @ r0 r1 r2 r3 r5 r6 r7 r4
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r5, r5, r0, ror #16
        eors r6, r6, r1, ror #16
        eors r7, r7, r2, ror #16
        eors r4, r4, r3, ror #16
        @ MUL(q0, q1, q2, q3);
        eors r0, r1          @ r1 r2 r3 r0 r5 r6 r7 r4
        @ MUL(q0, q1, q2, q3);
        eors r1, r2          @ r2 r3 r0 r1 r5 r6 r7 r4
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r2, r5
        eors r3, r6
        eors r0, r7
        eors r1, r4
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r5, r5, r2, ror #16
        eors r6, r6, r3, ror #16
        eors r7, r7, r0, ror #16
        eors r4, r4, r1, ror #16

        @ At this point, we have the following mapping:
        @   q0 in r2
        @   q1 in r3
        @   q2 in r0
        @   q3 in r1
        @   q4 in r5
        @   q5 in r6
        @   q6 in r7
        @   q7 in r4

        @ ============= Odd round, r = 1 mod 4

        @ Apply Sbox
        ands r8, r3, r0
        eors r8, r2, r8      @ r8 r3 r0 r1
        orrs r10, r8, r1
        eors r10, r3, r10    @ r8 r10 r0 r1
        orrs r11, r10, r0
        eors r11, r1, r11    @ r8 r10 r0 r11
        ands r3, r10, r11
        eors r3, r0, r3      @ r8 r10 r3 r11
        orrs r2, r8, r3
        eors r2, r10, r2     @ r8 r2 r3 r11
        orrs r1, r2, r11
        eors r1, r8, r1      @ r1 r2 r3 r11
        movs r0, r11         @ r1 r2 r3 r0

        ands r8, r6, r7
        eors r8, r5, r8      @ r8 r6 r7 r4
        orrs r10, r8, r4
        eors r6, r6, r10     @ r8 r6 r7 r4
        orrs r5, r6, r7
        eors r5, r4, r5      @ r8 r6 r7 r5
        ands r4, r6, r5
        eors r4, r7, r4      @ r8 r6 r4 r5
        orrs r11, r8, r4
        eors r6, r6, r11     @ r8 r6 r4 r5
        orrs r7, r6, r5
        eors r7, r8, r7      @ r7 r6 r4 r5

        @ Apply SR_slice
        movw r8, #0x3333
        movw r10, #0xCCCC
        ands r11, r8, r2, lsr #18
        ands r12, r10, r2, lsr #14
        orrs r12, r11, r12
        bfi r2, r12, #16, #16
        ands r11, r8, r3, lsr #18
        ands r12, r10, r3, lsr #14
        orrs r12, r11, r12
        bfi r3, r12, #16, #16
        ands r11, r8, r0, lsr #18
        ands r12, r10, r0, lsr #14
        orrs r12, r11, r12
        bfi r0, r12, #16, #16
        ands r11, r8, r1, lsr #18
        ands r12, r10, r1, lsr #14
        orrs r12, r11, r12
        bfi r1, r12, #16, #16
        movw r8, #0x1111
        movw r10, #0xEEEE
        ands r11, r10, r5, lsl #1    @ (x & 0x00007777) << 1  -> r11
        ands r12, r8, r5, lsr #3     @ (x >> 3) & 0x00001111  -> r12
        orrs r14, r11, r12
        ands r11, r8, r5, lsr #16    @ (x >> 16) & 0x00001111 -> r11
        ands r12, r10, r5, lsr #16   @ (x >> 16) & 0x0000EEEE -> r12
        orrs r5, r14, r11, lsl #19
        orrs r5, r5, r12, lsl #15
        ands r11, r10, r6, lsl #1    @ (x & 0x00007777) << 1  -> r11
        ands r12, r8, r6, lsr #3     @ (x >> 3) & 0x00001111  -> r12
        orrs r14, r11, r12
        ands r11, r8, r6, lsr #16    @ (x >> 16) & 0x00001111 -> r11
        ands r12, r10, r6, lsr #16   @ (x >> 16) & 0x0000EEEE -> r12
        orrs r6, r14, r11, lsl #19
        orrs r6, r6, r12, lsl #15
        ands r11, r10, r7, lsl #1    @ (x & 0x00007777) << 1  -> r11
        ands r12, r8, r7, lsr #3     @ (x >> 3) & 0x00001111  -> r12
        orrs r14, r11, r12
        ands r11, r8, r7, lsr #16    @ (x >> 16) & 0x00001111 -> r11
        ands r12, r10, r7, lsr #16   @ (x >> 16) & 0x0000EEEE -> r12
        orrs r7, r14, r11, lsl #19
        orrs r7, r7, r12, lsl #15
        ands r11, r10, r4, lsl #1    @ (x & 0x00007777) << 1  -> r11
        ands r12, r8, r4, lsr #3     @ (x >> 3) & 0x00001111  -> r12
        orrs r14, r11, r12
        ands r11, r8, r4, lsr #16    @ (x >> 16) & 0x00001111 -> r11
        ands r12, r10, r4, lsr #16   @ (x >> 16) & 0x0000EEEE -> r12
        orrs r4, r14, r11, lsl #19
        orrs r4, r4, r12, lsl #15

        @ Apply MDS
        @ Initial: state is: r2 r3 r0 r1 r5 r6 r7 r4
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r2, r5
        eors r3, r6
        eors r0, r7
        eors r1, r4
        @ MUL(q4, q5, q6, q7);
        eors r5, r6          @ r2 r3 r0 r1 r6 r7 r4 r5
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r6, r6, r2, ror #16
        eors r7, r7, r3, ror #16
        eors r4, r4, r0, ror #16
        eors r5, r5, r1, ror #16
        @ MUL(q0, q1, q2, q3);
        eors r2, r3          @ r3 r0 r1 r2 r6 r7 r4 r5
        @ MUL(q0, q1, q2, q3);
        eors r3, r0          @ r0 r1 r2 r3 r6 r7 r4 r5
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r0, r6
        eors r1, r7
        eors r2, r4
        eors r3, r5
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r6, r6, r0, ror #16
        eors r7, r7, r1, ror #16
        eors r4, r4, r2, ror #16
        eors r5, r5, r3, ror #16

        @ At this point, we have the following mapping:
        @   q0 in r0
        @   q1 in r1
        @   q2 in r2
        @   q3 in r3
        @   q4 in r6
        @   q5 in r7
        @   q6 in r4
        @   q7 in r5

        @ Apply SR_slice_inv
        movw r8, #0x3333
        movw r10, #0xCCCC
        ands r11, r8, r0, lsr #18
        ands r12, r10, r0, lsr #14
        orrs r12, r11, r12
        bfi r0, r12, #16, #16
        ands r11, r8, r1, lsr #18
        ands r12, r10, r1, lsr #14
        orrs r12, r11, r12
        bfi r1, r12, #16, #16
        ands r11, r8, r2, lsr #18
        ands r12, r10, r2, lsr #14
        orrs r12, r11, r12
        bfi r2, r12, #16, #16
        ands r11, r8, r3, lsr #18
        ands r12, r10, r3, lsr #14
        orrs r12, r11, r12
        bfi r3, r12, #16, #16
        movw r8, #0x7777
        movw r10, #0x8888
        ands r11, r10, r6, lsl #3    @ (x & 0x00001111) << 3  -> r11
        ands r12, r8, r6, lsr #1     @ (x >> 1) & 0x00007777  -> r12
        orrs r14, r11, r12
        ands r11, r8, r6, lsr #16    @ (x >> 16) & 0x00007777 -> r11
        ands r12, r10, r6, lsr #16   @ (x >> 16) & 0x00008888 -> r12
        orrs r6, r14, r11, lsl #17
        orrs r6, r6, r12, lsl #13
        ands r11, r10, r7, lsl #3    @ (x & 0x00001111) << 3  -> r11
        ands r12, r8, r7, lsr #1     @ (x >> 1) & 0x00007777  -> r12
        orrs r14, r11, r12
        ands r11, r8, r7, lsr #16    @ (x >> 16) & 0x00007777 -> r11
        ands r12, r10, r7, lsr #16   @ (x >> 16) & 0x00008888 -> r12
        orrs r7, r14, r11, lsl #17
        orrs r7, r7, r12, lsl #13
        ands r11, r10, r4, lsl #3    @ (x & 0x00001111) << 3  -> r11
        ands r12, r8, r4, lsr #1     @ (x >> 1) & 0x00007777  -> r12
        orrs r14, r11, r12
        ands r11, r8, r4, lsr #16    @ (x >> 16) & 0x00007777 -> r11
        ands r12, r10, r4, lsr #16   @ (x >> 16) & 0x00008888 -> r12
        orrs r4, r14, r11, lsl #17
        orrs r4, r4, r12, lsl #13
        ands r11, r10, r5, lsl #3    @ (x & 0x00001111) << 3  -> r11
        ands r12, r8, r5, lsr #1     @ (x >> 1) & 0x00007777  -> r12
        orrs r14, r11, r12
        ands r11, r8, r5, lsr #16    @ (x >> 16) & 0x00007777 -> r11
        ands r12, r10, r5, lsr #16   @ (x >> 16) & 0x00008888 -> r12
        orrs r5, r14, r11, lsl #17
        orrs r5, r5, r12, lsl #13
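        @ (Note on the two mask/shift blocks above: SR_slice and
        @ SR_slice_inv act on each 16-bit half-word by rotating every
        @ 4-bit group in place. The 0x3333/0xCCCC pattern is a 2-bit
        @ rotation, which is its own inverse; the 0x1111/0xEEEE and
        @ 0x7777/0x8888 patterns are 1-bit and 3-bit rotations,
        @ inverses of each other.)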

        @ XOR round constant
        ldr r8, [sp, #4]
        ldm r8!, {r10}
        str r8, [sp, #4]
        eors r0, r10

        @ XOR rotated key
        ldr r8, [sp, #8]
        adds r8, #32
        ldm r8!, {r10, r11, r12, r14}
        eors r0, r10
        eors r1, r11
        eors r2, r12
        eors r3, r14
        ldm r8!, {r10, r11, r12, r14}
        eors r6, r10
        eors r7, r11
        eors r4, r12
        eors r5, r14

        @ ============= Even round

        @ Apply Sbox
        ands r8, r1, r2
        eors r8, r0, r8      @ r8 r1 r2 r3
        orrs r10, r8, r3
        eors r10, r1, r10    @ r8 r10 r2 r3
        orrs r11, r10, r2
        eors r11, r3, r11    @ r8 r10 r2 r11
        ands r1, r10, r11
        eors r1, r2, r1      @ r8 r10 r1 r11
        orrs r0, r8, r1
        eors r0, r10, r0     @ r8 r0 r1 r11
        orrs r3, r0, r11
        eors r3, r8, r3      @ r3 r0 r1 r11
        movs r2, r11         @ r3 r0 r1 r2

        ands r8, r7, r4
        eors r8, r6, r8      @ r8 r7 r4 r5
        orrs r10, r8, r5
        eors r7, r7, r10     @ r8 r7 r4 r5
        orrs r6, r7, r4
        eors r6, r5, r6      @ r8 r7 r4 r6
        ands r5, r7, r6
        eors r5, r4, r5      @ r8 r7 r5 r6
        orrs r11, r8, r5
        eors r7, r7, r11     @ r8 r7 r5 r6
        orrs r4, r7, r6
        eors r4, r8, r4      @ r4 r7 r5 r6

        @ Apply MDS
        @ Initial: state is: r0 r1 r2 r3 r6 r7 r4 r5
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r0, r6
        eors r1, r7
        eors r2, r4
        eors r3, r5
        @ MUL(q4, q5, q6, q7);
        eors r6, r7          @ r0 r1 r2 r3 r7 r4 r5 r6
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r7, r7, r0, ror #16
        eors r4, r4, r1, ror #16
        eors r5, r5, r2, ror #16
        eors r6, r6, r3, ror #16
        @ MUL(q0, q1, q2, q3);
        eors r0, r1          @ r1 r2 r3 r0 r7 r4 r5 r6
        @ MUL(q0, q1, q2, q3);
        eors r1, r2          @ r2 r3 r0 r1 r7 r4 r5 r6
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r2, r7
        eors r3, r4
        eors r0, r5
        eors r1, r6
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r7, r7, r2, ror #16
        eors r4, r4, r3, ror #16
        eors r5, r5, r0, ror #16
        eors r6, r6, r1, ror #16

        @ At this point, we have the following mapping:
        @   q0 in r2
        @   q1 in r3
        @   q2 in r0
        @   q3 in r1
        @   q4 in r7
        @   q5 in r4
        @   q6 in r5
        @   q7 in r6

        @ ============= Odd round, r = 3 mod 4

        @ Apply Sbox
        ands r8, r3, r0
        eors r8, r2, r8      @ r8 r3 r0 r1
        orrs r10, r8, r1
        eors r10, r3, r10    @ r8 r10 r0 r1
        orrs r11, r10, r0
        eors r11, r1, r11    @ r8 r10 r0 r11
        ands r3, r10, r11
        eors r3, r0, r3      @ r8 r10 r3 r11
        orrs r2, r8, r3
        eors r2, r10, r2     @ r8 r2 r3 r11
        orrs r1, r2, r11
        eors r1, r8, r1      @ r1 r2 r3 r11
        movs r0, r11         @ r1 r2 r3 r0

        ands r8, r4, r5
        eors r8, r7, r8      @ r8 r4 r5 r6
        orrs r10, r8, r6
        eors r4, r4, r10     @ r8 r4 r5 r6
        orrs r7, r4, r5
        eors r7, r6, r7      @ r8 r4 r5 r7
        ands r6, r4, r7
        eors r6, r5, r6      @ r8 r4 r6 r7
        orrs r11, r8, r6
        eors r4, r4, r11     @ r8 r4 r6 r7
        orrs r5, r4, r7
        eors r5, r8, r5      @ r5 r4 r6 r7

        @ Apply SR_sheet
        rev r8, r2
        bfi r2, r8, #16, #16
        rev r8, r3
        bfi r3, r8, #16, #16
        rev r8, r0
        bfi r0, r8, #16, #16
        rev r8, r1
        bfi r1, r8, #16, #16
        uxth r10, r7
        lsrs r11, r7, #20
        lsls r7, r7, #12
        bfi r7, r11, #16, #12
        bfi r7, r10, #4, #12
        orr r7, r7, r10, lsr #12
        uxth r10, r4
        lsrs r11, r4, #20
        lsls r4, r4, #12
        bfi r4, r11, #16, #12
        bfi r4, r10, #4, #12
        orr r4, r4, r10, lsr #12
        uxth r10, r5
        lsrs r11, r5, #20
        lsls r5, r5, #12
        bfi r5, r11, #16, #12
        bfi r5, r10, #4, #12
        orr r5, r5, r10, lsr #12
        uxth r10, r6
        lsrs r11, r6, #20
        lsls r6, r6, #12
        bfi r6, r11, #16, #12
        bfi r6, r10, #4, #12
        orr r6, r6, r10, lsr #12

        @ Apply MDS
        @ Initial: state is: r2 r3 r0 r1 r7 r4 r5 r6
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r2, r7
        eors r3, r4
        eors r0, r5
        eors r1, r6
        @ MUL(q4, q5, q6, q7);
        eors r7, r4          @ r2 r3 r0 r1 r4 r5 r6 r7
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r4, r4, r2, ror #16
        eors r5, r5, r3, ror #16
        eors r6, r6, r0, ror #16
        eors r7, r7, r1, ror #16
        @ MUL(q0, q1, q2, q3);
        eors r2, r3          @ r3 r0 r1 r2 r4 r5 r6 r7
        @ MUL(q0, q1, q2, q3);
        eors r3, r0          @ r0 r1 r2 r3 r4 r5 r6 r7
        @ q0 ^= q4; q1 ^= q5; q2 ^= q6; q3 ^= q7;
        eors r0, r4
        eors r1, r5
        eors r2, r6
        eors r3, r7
        @ q4 ^= SW(q0); q5 ^= SW(q1); q6 ^= SW(q2); q7 ^= SW(q3);
        eors r4, r4, r0, ror #16
        eors r5, r5, r1, ror #16
        eors r6, r6, r2, ror #16
        eors r7, r7, r3, ror #16

        @ At this point, we have the following mapping:
        @   q0 in r0
        @   q1 in r1
        @   q2 in r2
        @   q3 in r3
        @   q4 in r4
        @   q5 in r5
        @   q6 in r6
        @   q7 in r7

        @ Apply SR_sheet_inv
        rev r8, r0
        bfi r0, r8, #16, #16
        rev r8, r1
        bfi r1, r8, #16, #16
        rev r8, r2
        bfi r2, r8, #16, #16
        rev r8, r3
        bfi r3, r8, #16, #16
        lsrs r10, r4, #4
        lsrs r11, r4, #28
        bfi r4, r4, #8, #4
        lsls r4, r4, #4
        bfi r4, r11, #16, #4
        bfi r4, r10, #0, #12
        lsrs r10, r5, #4
        lsrs r11, r5, #28
        bfi r5, r5, #8, #4
        lsls r5, r5, #4
        bfi r5, r11, #16, #4
        bfi r5, r10, #0, #12
        lsrs r10, r6, #4
        lsrs r11, r6, #28
        bfi r6, r6, #8, #4
        lsls r6, r6, #4
        bfi r6, r11, #16, #4
        bfi r6, r10, #0, #12
        lsrs r10, r7, #4
        lsrs r11, r7, #28
        bfi r7, r7, #8, #4
        lsls r7, r7, #4
        bfi r7, r11, #16, #4
        bfi r7, r10, #0, #12

        @ XOR round constant
        ldr r8, [sp, #4]
        ldm r8!, {r10}
        str r8, [sp, #4]
        eors r0, r10

        @ XOR non-rotated key
        ldr r8, [sp, #8]
        ldm r8!, {r10, r11, r12, r14}
        eors r0, r10
        eors r1, r11
        eors r2, r12
        eors r3, r14
        ldm r8!, {r10, r11, r12, r14}
        eors r4, r10
        eors r5, r11
        eors r6, r12
        eors r7, r14

        @ Loop for sufficiently many rounds.
        ldr r8, [sp]
        subs r8, #2
        str r8, [sp]
        bne .Lsaturnin_block_encrypt_loop

        @ Encode back the final state.
        ldr r8, [sp, #12]
        strh r0, [r8]
        strh r1, [r8, #2]
        strh r2, [r8, #4]
        strh r3, [r8, #6]
        strh r4, [r8, #8]
        strh r5, [r8, #10]
        strh r6, [r8, #12]
        strh r7, [r8, #14]
        lsrs r0, r0, #16
        lsrs r1, r1, #16
        lsrs r2, r2, #16
        lsrs r3, r3, #16
        lsrs r4, r4, #16
        lsrs r5, r5, #16
        lsrs r6, r6, #16
        lsrs r7, r7, #16
        strh r0, [r8, #16]
        strh r1, [r8, #18]
        strh r2, [r8, #20]
        strh r3, [r8, #22]
        strh r4, [r8, #24]
        strh r5, [r8, #26]
        strh r6, [r8, #28]
        strh r7, [r8, #30]

        pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r10, r11, pc}
        .size saturnin_block_encrypt, .-saturnin_block_encrypt

@ =======================================================================
@ void xor32_aligned(uint8_t *dst, const uint8_t *src)
@
@ XOR the 32 bytes from src[] into the 32 bytes at dst[]. Both src[]
@ and dst[] MUST be 32-bit aligned.
@ =======================================================================

        .align 1
        .thumb
        .thumb_func
        .type xor32_aligned, %function
xor32_aligned:
        push {r0, r4, r5, r6, r7, lr}  @ r0 is for stack 8-byte alignment
        ldm r0, {r2, r3, r4, r5}
        ldm r1!, {r6, r7, r12, r14}
        eors r2, r6
        eors r3, r7
        eors r4, r12
        eors r5, r14
        stm r0!, {r2, r3, r4, r5}
        ldm r0, {r2, r3, r4, r5}
        ldm r1!, {r6, r7, r12, r14}
        eors r2, r6
        eors r3, r7
        eors r4, r12
        eors r5, r14
        stm r0!, {r2, r3, r4, r5}
        pop {r0, r4, r5, r6, r7, pc}
        .size xor32_aligned, .-xor32_aligned

@ =======================================================================
@ Saturnin-CTR-Cascade (AEAD mode).
@
@ The context is the following structure (declared in saturnin.h):
@
@   typedef struct {
@       uint32_t keybuf[16];
@       uint8_t cascade[32];
@       uint8_t ctr[32];
@       uint8_t buf[32];
@       size_t ptr;
@   } saturnin_aead_context;
@
@ =======================================================================
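
@ -----------------------------------------------------------------------
@ One Cascade step, as performed by do_cascade_aligned() below (C
@ sketch; the kb[] name is illustrative, it is a local 16-word key
@ schedule buffer):
@
@     uint32_t kb[16];
@     saturnin_key_expand(kb, cc->cascade);
@     memcpy(cc->cascade, cc->buf, 32);
@     saturnin_block_encrypt(10, rc, kb, cc->cascade);
@     for (int i = 0; i < 32; i ++) {
@         cc->cascade[i] ^= cc->buf[i];
@     }
@ -----------------------------------------------------------------------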

@ =======================================================================
@ void do_cascade_aligned(saturnin_aead_context *cc, const uint32_t *rc)
@
@ Apply the Cascade for one more block. The block is cc->buf[].
@ This uses the current cascade[] field as key for a Saturnin block
@ encryption (10 super-rounds); the input of that block encryption
@ is cc->buf[]; the XOR of the output with the input block becomes the
@ new cascade[] field.
@ =======================================================================

        .align 1
        .thumb
        .thumb_func
        .type do_cascade_aligned, %function
do_cascade_aligned:
        push {r1, r4, r5, lr}
        @ We set r4 to the cascade[] field within the context,
        @ and r5 to the buf[] address.
        adds r5, r0, #128
        adds r4, r0, #64
        @ Expand the current cascade[] as a key into a stack-allocated
        @ keybuf[].
        sub sp, sp, #64
        mov r0, sp
        movs r1, r4
        bl saturnin_key_expand
        @ Copy the buf[] block into the cascade[] field. We assume that
        @ both are aligned.
        ldm r5!, {r0, r1, r2, r3}
        stm r4!, {r0, r1, r2, r3}
        ldm r5!, {r0, r1, r2, r3}
        stm r4!, {r0, r1, r2, r3}
        subs r5, #32
        subs r4, #32
        @ Perform the block encryption.
        movs r0, #10
        ldr r1, [sp, #64]
        mov r2, sp
        movs r3, r4
        bl saturnin_block_encrypt
        @ XOR the input block with the encryption output.
        movs r0, r4
        movs r1, r5
        bl xor32_aligned
        @ Exit. We must add 64 for the stack-allocated keybuf[], and 4
        @ for the r1 which was saved.
        add sp, sp, #68
        pop {r4, r5, pc}
        .size do_cascade_aligned, .-do_cascade_aligned

@ =======================================================================
@ void saturnin_aead_init(saturnin_aead_context *cc,
@                         const void *key, size_t key_len)
@
@ Initialize the context with the provided key. The 'key_len'
@ parameter is ignored (it is up to the caller to make sure that only
@ 32-byte keys are provided).
@ =======================================================================

        .align 1
        .global saturnin_aead_init
        .thumb
        .thumb_func
        .type saturnin_aead_init, %function
saturnin_aead_init:
        @ Initialization is just expanding the key into the keybuf[]
        @ field of the context, which is at offset 0.
        b saturnin_key_expand
        .size saturnin_aead_init, .-saturnin_aead_init
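
@ -----------------------------------------------------------------------
@ The tables below provide one 32-bit constant per super-round (10
@ words for the 10-super-round AEAD encryptions, 16 words for the
@ 16-super-round hash compressions). Distinct tables provide domain
@ separation; as used in this file: .RC_10_1 for CTR encryption,
@ .RC_10_2 for the Cascade over the nonce and AAD blocks, .RC_10_3 for
@ the final AAD block, .RC_10_4 for the Cascade over ciphertext blocks,
@ .RC_10_5 for tag finalization, .RC_16_7 for hash message blocks, and
@ .RC_16_8 for the final padded hash block.
@ -----------------------------------------------------------------------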

@ =======================================================================
@ Precomputed round constants.
@ =======================================================================

        .align 2
        .type .RC_10_1, %object
        .size .RC_10_1, 40
.RC_10_1:
        .word 0x4EB026C2
        .word 0x90595303
        .word 0xAA8FE632
        .word 0xFE928A92
        .word 0x4115A419
        .word 0x93539532
        .word 0x5DB1CC4E
        .word 0x541515CA
        .word 0xBD1F55A8
        .word 0x5A6E1A0D
        .type .RC_10_2, %object
        .size .RC_10_2, 40
.RC_10_2:
        .word 0x4E4526B5
        .word 0xA3565FF0
        .word 0x0F8F20D8
        .word 0x0B54BEE1
        .word 0x7D1A6C9D
        .word 0x17A6280A
        .word 0xAA46C986
        .word 0xC1199062
        .word 0x182C5CDE
        .word 0xA00D53FE
        .type .RC_10_3, %object
        .size .RC_10_3, 40
.RC_10_3:
        .word 0x4E162698
        .word 0xB2535BA1
        .word 0x6C8F9D65
        .word 0x5816AD30
        .word 0x691FD4FA
        .word 0x6BF5BCF9
        .word 0xF8EB3525
        .word 0xB21DECFA
        .word 0x7B3DA417
        .word 0xF62C94B4
        .type .RC_10_4, %object
        .size .RC_10_4, 40
.RC_10_4:
        .word 0x4FAF265B
        .word 0xC5484616
        .word 0x45DCAD21
        .word 0xE08BD607
        .word 0x0504FDB8
        .word 0x1E1F5257
        .word 0x45FBC216
        .word 0xEB529B1F
        .word 0x52194E32
        .word 0x5498C018
        .type .RC_10_5, %object
        .size .RC_10_5, 40
.RC_10_5:
        .word 0x4FFC2676
        .word 0xD44D4247
        .word 0x26DC109C
        .word 0xB3C9C5D6
        .word 0x110145DF
        .word 0x624CC6A4
        .word 0x17563EB5
        .word 0x9856E787
        .word 0x3108B6FB
        .word 0x02B90752
        .type .RC_16_7, %object
        .size .RC_16_7, 64
.RC_16_7:
        .word 0x3FBA180C
        .word 0x563AB9AB
        .word 0x125EA5EF
        .word 0x859DA26C
        .word 0xB8CF779B
        .word 0x7D4DE793
        .word 0x07EFB49F
        .word 0x8D525306
        .word 0x1E08E6AB
        .word 0x41729F87
        .word 0x8C4AEF0A
        .word 0x4AA0C9A7
        .word 0xD93A95EF
        .word 0xBB00D2AF
        .word 0xB62C5BF0
        .word 0x386D94D8
        .type .RC_16_8, %object
        .size .RC_16_8, 64
.RC_16_8:
        .word 0x3C9B19A7
        .word 0xA9098694
        .word 0x23F878DA
        .word 0xA7B647D3
        .word 0x74FC9D78
        .word 0xEACAAE11
        .word 0x2F31A677
        .word 0x4CC8C054
        .word 0x2F51CA05
        .word 0x5268F195
        .word 0x4F5B8A2B
        .word 0xF614B4AC
        .word 0xF1D95401
        .word 0x764D2568
        .word 0x6A493611
        .word 0x8EEF9C3E

@ =======================================================================
@ void saturnin_aead_reset(saturnin_aead_context *cc,
@                          const void *nonce, size_t nonce_len)
@
@ Start a new message, with the provided nonce. Nonce length is between
@ 0 and 20 bytes (inclusive); the nonce[] buffer is not necessarily
@ aligned.
@ =======================================================================

        .align 1
        .global saturnin_aead_reset
        .thumb
        .thumb_func
        .type saturnin_aead_reset, %function
saturnin_aead_reset:
        push {r4, r5, r6, lr}
        adds r0, #96         @ offset of ctr[] in the context
        movs r4, r0
        movs r6, r2          @ nonce_len
        @ Copy the nonce into the ctr[] buffer.
        bl memcpy
        @ Pad the nonce.
        adds r0, r4, r6
        movs r1, #0x80
        strb r1, [r0]
        adds r0, #1
        eors r1, r1
        movs r2, #31
        subs r2, r6
        bl memset
        @ Copy the ctr[] field to the cascade[] field; both are aligned.
        subs r5, r4, #32
        ldm r4!, {r0, r1, r2, r3}
        stm r5!, {r0, r1, r2, r3}
        ldm r4!, {r0, r1, r2, r3}
        stm r5!, {r0, r1, r2, r3}
        @ Start the cascade by encrypting the initial ctr[] value, then
        @ XORing the output with that block.
        movs r0, #10
        adr r1, .RC_10_2
        subs r4, r5, #32
        subs r2, r4, #64
        movs r3, r4
        bl saturnin_block_encrypt
        movs r0, r4
        movs r1, r5
        bl xor32_aligned
        @ Set the ptr field to 0
        movs r0, #0
        str r0, [r5, #64]
        pop {r4, r5, r6, pc}
        .size saturnin_aead_reset, .-saturnin_aead_reset
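
@ -----------------------------------------------------------------------
@ Sketch in C of the work done by saturnin_aead_reset() above (field
@ accesses are illustrative, assuming the saturnin.h layout shown
@ later in this file):
@
@     memcpy(cc->ctr, nonce, nonce_len);
@     cc->ctr[nonce_len] = 0x80;
@     memset(cc->ctr + nonce_len + 1, 0, 31 - nonce_len);
@     memcpy(cc->cascade, cc->ctr, 32);
@     saturnin_block_encrypt(10, rc_10_2, cc->keybuf, cc->cascade);
@     for (int i = 0; i < 32; i ++) {
@         cc->cascade[i] ^= cc->ctr[i];
@     }
@     cc->ptr = 0;
@ -----------------------------------------------------------------------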

@ =======================================================================
@ void saturnin_aead_aad_inject(saturnin_aead_context *cc,
@                               const void *aad, size_t aad_len)
@
@ Inject some associated authenticated data.
@ =======================================================================

        .align 1
        .global saturnin_aead_aad_inject
        .thumb
        .thumb_func
        .type saturnin_aead_aad_inject, %function
saturnin_aead_aad_inject:
        push {r0, r4, r5, r6, r7, lr}  @ r0 for alignment
        movs r4, r0          @ context
        movs r5, r1          @ aad[]
        movs r6, r2          @ aad_len
        ldr r7, [r4, #160]   @ ptr
        @ If the buffer is not empty, try to complete it with fresh AAD.
        @ If the provided AAD is not sufficient, simply exit.
        cbz r7, .Laad_inject_step2
        adds r0, #128
        adds r0, r7          @ &cc->buf[ptr]
        movs r2, #32
        subs r2, r7
        cmp r2, r6
        bhi .Laad_inject_early_exit
        adds r5, r2
        subs r6, r2
        bl memcpy
        movs r0, r4
        adr r1, .RC_10_2
        bl do_cascade_aligned
        movs r7, #0

.Laad_inject_step2:
.Laad_inject_loop:
        cmp r6, #31
        bls .Laad_inject_exit
        @ Copy the next block into buf[]. Source data is not necessarily
        @ aligned, so we use memcpy().
        adds r0, r4, #128
        movs r1, r5
        movs r2, #32
        bl memcpy
        @ Continue the cascade.
        movs r0, r4
        adr r1, .RC_10_2
        bl do_cascade_aligned
        @ Consider the 32 bytes of AAD consumed.
        adds r5, #32
        subs r6, #32
        b .Laad_inject_loop

.Laad_inject_exit:
        @ Copy the remaining data into buf[].
        adds r0, r4, #128
        movs r1, r5
        movs r2, r6
        bl memcpy
        str r6, [r4, #160]
        pop {r0, r4, r5, r6, r7, pc}

.Laad_inject_early_exit:
        movs r2, r6
        adds r7, r6
        bl memcpy
        str r7, [r4, #160]
        pop {r0, r4, r5, r6, r7, pc}
        .size saturnin_aead_aad_inject, .-saturnin_aead_aad_inject

@ =======================================================================
@ void pad_buffer(void *buf)
@
@   r0   pointer to the 32-byte buffer (must be followed by ptr)
@
@ r0, r1, r2 and r3 may be altered arbitrarily.
@
@ This routine extracts the ptr value at address r0+32 (i.e.
@ immediately after the buffer to pad); this is the length of the data
@ in the buffer.
@ =======================================================================

        .align 1
        .thumb
        .thumb_func
        .type pad_buffer, %function
pad_buffer:
        ldr r1, [r0, #32]    @ ptr
        adds r0, r1
        movs r2, #0x80
        strb r2, [r0]
        adds r0, #1
        movs r2, #31
        subs r2, r1
        movs r1, #0
        b memset             @ tail call to memset
        .size pad_buffer, .-pad_buffer
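
@ -----------------------------------------------------------------------
@ In C, the padding performed by pad_buffer() is simply (sketch):
@
@     buf[ptr] = 0x80;
@     memset(buf + ptr + 1, 0, 31 - ptr);
@ -----------------------------------------------------------------------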

@ =======================================================================
@ void saturnin_aead_flip(saturnin_aead_context *cc)
@
@ Finish processing of AAD, start things for encryption/decryption.
@ =======================================================================

        .align 1
        .global saturnin_aead_flip
        .thumb
        .thumb_func
        .type saturnin_aead_flip, %function
saturnin_aead_flip:
        push {r4, lr}
        movs r4, r0          @ context
        adds r0, #128
        bl pad_buffer
        @ Process the final block.
        movs r0, r4
        adr r1, .RC_10_3
        bl do_cascade_aligned
        @ Clear the ptr field.
        movs r0, #0
        str r0, [r4, #160]
        pop {r4, pc}
        .size saturnin_aead_flip, .-saturnin_aead_flip

@ =======================================================================
@ aead_run_partial: internal routine, special call convention
@
@   r2   buf (pointer &cc->buf[ptr])
@   r5   encrypt (0 = decrypt, -1 = encrypt)
@   r6   data
@   r3   number of bytes to process (0 <= r3 <= 31)
@
@ r0 and r1 may be altered arbitrarily.
@ r2 is modified (new r2 is the sum of old r2 and old r3).
@ r6 is modified (new r6 is the sum of old r6 and old r3).
@ r3 is set to 0.
@
@ Encrypt (if r5 == -1) or decrypt (if r5 == 0) some data.
@ The encryption/decryption is done in place. The r2 parameter points
@ to some location within the buf[] field of the context (not
@ necessarily at the start); all r3 bytes must fit in the remainder of
@ that buffer. This function performs only buffer processing (XOR,
@ keeping the ciphertext bytes), but does not call the block
@ encryption or the cascade.
@ =======================================================================

        .align 1
        .thumb
        .thumb_func
        .type aead_run_partial, %function
aead_run_partial:
        cbz r3, .Laead_run_partial_exit
.Laead_run_partial_loop:
        ldrb r0, [r2]        @ CTR-stream byte
        ldrb r1, [r6]        @ input byte
        eors r1, r0          @ output byte
        bics r0, r5
        eors r0, r1          @ ciphertext byte
        strb r1, [r6]
        strb r0, [r2]
        adds r2, #1
        adds r6, #1
        subs r3, #1
        bne .Laead_run_partial_loop
.Laead_run_partial_exit:
        bx lr
        .size aead_run_partial, .-aead_run_partial

@ =======================================================================
@ aead_run_incr_ctr: internal routine, special call convention
@
@   r4   pointer &cc->buf[0]
@
@ r0, r1, r2 and r3 may be altered arbitrarily.
@
@ Increment the counter value (from ctr[]), and copy it to the buf[]
@ array.
@ =======================================================================

        .align 1
        .thumb
        .thumb_func
        .type aead_run_incr_ctr, %function
aead_run_incr_ctr:
        movs r0, r4
        subs r0, #32         @ &cc->ctr[0]
        ldm r0!, {r1, r2, r3}
        adds r0, #20
        stm r0!, {r1, r2, r3}
        subs r0, #32
        ldm r0!, {r1, r2, r3}
        adds r0, #20
        stm r0!, {r1, r2, r3}
        subs r0, #32
        ldm r0!, {r1, r2}
        rev r1, r1
        rev r2, r2
        movs r3, #0
        adds r2, #1
        adcs r1, r3
        rev r1, r1
        rev r2, r2
        subs r0, #8
        stm r0!, {r1, r2}
        adds r0, #24
        stm r0!, {r1, r2}
        bx lr
        .size aead_run_incr_ctr, .-aead_run_incr_ctr
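
@ -----------------------------------------------------------------------
@ Equivalent C sketch for aead_run_incr_ctr(): the last eight bytes of
@ ctr[] hold a 64-bit big-endian counter, and the incremented counter
@ block is copied into buf[] to be encrypted (field accesses are
@ illustrative, assuming the saturnin.h layout):
@
@     for (int i = 31; i >= 24; i --) {
@         if (++ cc->ctr[i] != 0) {
@             break;
@         }
@     }
@     memcpy(cc->buf, cc->ctr, 32);
@ -----------------------------------------------------------------------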

@ =======================================================================
@ void saturnin_aead_run(saturnin_aead_context *cc, int encrypt,
@                        void *data, size_t data_len)
@
@ Encrypt (if encrypt != 0) or decrypt (if encrypt == 0) some data.
@ The encryption/decryption is done in place.
@ =======================================================================

        .align 1
        .global saturnin_aead_run
        .thumb
        .thumb_func
        .type saturnin_aead_run, %function
saturnin_aead_run:
        @ If there is no data, then exit early.
        cbnz r3, .Lsaturnin_aead_run_step1
        bx lr
.Lsaturnin_aead_run_step1:
        push {r0, r4, r5, r6, r7, lr}  @ r0 is for stack alignment
        @ Save arguments into preserved registers.
        movs r4, r0
        movs r5, r1
        movs r6, r2
        movs r7, r3
        @ Normalize the 'encrypt' flag (-1 or 0).
        @ We use the fact that x|-x has its high bit set iff x != 0.
        rsbs r1, r1, #0      @ negate r1
        orrs r5, r1
        asrs r5, r5, #31
        @ Also normalize r4 to point to the buf[] field (this keeps
        @ offsets low, and thus allows the use of smaller encodings for
        @ ldr and str opcodes).
        adds r4, #128
        @ If there is already a partial block, try to complete it.
        ldr r2, [r4, #32]
        cbz r2, .Lsaturnin_aead_run_step2
        @ Compute the number of bytes to inject in the first partial run.
        rsbs r3, r2, #32
        cmp r3, r7
        it hi
        movhi r3, r7
        @ Pointer &cc->buf[ptr] -> into r2.
        adds r2, r4
        @ The aead_run_partial call will consume r3 data bytes.
        subs r7, r3
        @ Process the r3 next bytes.
        bl aead_run_partial
        @ r2 now points to the new &cc->buf[ptr]. From this we can
        @ infer the new value of ptr. If this is no more than 31, then
        @ we are finished; we just write back the new value, and exit.
        movs r3, r2
        subs r3, r4
        cmp r3, #31
        bls .Lsaturnin_aead_run_exit
        @ We made a complete block, so we must run the cascade on it.
        movs r0, r4
        subs r0, #128
        adr r1, .RC_10_4
        bl do_cascade_aligned

.Lsaturnin_aead_run_step2:
        @ At this point, the buffer is empty. We process all full
        @ blocks.
.Lsaturnin_aead_run_loop:
        @ Consume 32 bytes; if the result is negative, then there are
        @ not as many remaining bytes, and we go to step 3.
        subs r7, #32
        bmi .Lsaturnin_aead_run_step3
        @ Generate next ctr block and encrypt it. We use a 64-bit
        @ counter; as per the specification, the counter is big-endian.
        bl aead_run_incr_ctr
        movs r0, #10         @ 10 super-rounds
        adr r1, .RC_10_1     @ round constants
        movs r2, r4
        subs r2, #128        @ keybuf[]
        movs r3, r4          @ buf[]
        bl saturnin_block_encrypt
        @ Read data block, update it, and also copy the ciphertext to
        @ the buf[] array. The data block may be unaligned.
        movs r3, #28
.Lsaturnin_aead_run_loop2:
        ldr r0, [r4, r3]     @ CTR-stream word
        ldr r1, [r6, r3]     @ input word
        eors r1, r0          @ output word
        bics r0, r5
        eors r0, r1          @ ciphertext word
        str r1, [r6, r3]     @ store back output word
        str r0, [r4, r3]     @ store ciphertext word in buf
        subs r3, #4
        bpl .Lsaturnin_aead_run_loop2
        @ Apply the cascade on the full ciphertext block.
        movs r0, r4
        subs r0, #128
        adr r1, .RC_10_4
        bl do_cascade_aligned
        @ Loop for next block.
        adds r6, #32
        b .Lsaturnin_aead_run_loop

.Lsaturnin_aead_run_step3:
        @ At that point, there is less than a full block, and the buffer
        @ is empty. The number of remaining bytes to process is r7+32.
        adds r7, #32
        movs r3, r7
        beq .Lsaturnin_aead_run_exit
        @ Prepare next block of CTR-stream
        bl aead_run_incr_ctr
        movs r0, #10         @ 10 super-rounds
        adr r1, .RC_10_1     @ round constants
        movs r2, r4
        subs r2, #128        @ keybuf[]
        movs r3, r4          @ buf[]
        bl saturnin_block_encrypt
        @ Process the remaining bytes.
        movs r2, r4
        movs r3, r7
        bl aead_run_partial
        @ Set r3 to the new value of cc->ptr.
        movs r3, r7
.Lsaturnin_aead_run_exit:
        @ We get here when we are finished, with just cc->ptr to update.
        @ The new value of ptr is in r3.
        str r3, [r4, #32]
        pop {r0, r4, r5, r6, r7, pc}
        .size saturnin_aead_run, .-saturnin_aead_run

@ =======================================================================
@ void saturnin_aead_get_tag(saturnin_aead_context *cc,
@                            void *tag, size_t tag_len)
@
@ Finish computation of the authentication tag, and copy it into
@ the provided buffer. The caller ensures that tag_len is at most 32.
@ =======================================================================

        .align 1
        .global saturnin_aead_get_tag
        .thumb
        .thumb_func
        .type saturnin_aead_get_tag, %function
saturnin_aead_get_tag:
        push {r4, r5, r6, lr}
        movs r4, r0          @ context
        movs r5, r1          @ tag
        movs r6, r2          @ tag_len
        @ Pad the current buffer contents.
        adds r0, #128        @ &cc->buf[0]
        bl pad_buffer
        @ Final Cascade block.
        movs r0, r4
        adr r1, .RC_10_5
        bl do_cascade_aligned
        @ Copy the tag value.
        movs r0, r5
        adds r1, r4, #64
        movs r2, r6
        bl memcpy
        pop {r4, r5, r6, pc}
        .size saturnin_aead_get_tag, .-saturnin_aead_get_tag
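
@ -----------------------------------------------------------------------
@ Typical encryption call sequence for this API (C sketch, assuming
@ the declarations from saturnin.h; buffer names are illustrative):
@
@     saturnin_aead_context cc;
@     saturnin_aead_init(&cc, key, 32);
@     saturnin_aead_reset(&cc, nonce, nonce_len);  /* nonce_len <= 20 */
@     saturnin_aead_aad_inject(&cc, aad, aad_len);
@     saturnin_aead_flip(&cc);
@     saturnin_aead_run(&cc, 1, data, data_len);   /* in-place encrypt */
@     saturnin_aead_get_tag(&cc, tag, 32);
@ -----------------------------------------------------------------------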

@ =======================================================================
@ int saturnin_aead_check_tag(saturnin_aead_context *cc,
@                             const void *tag, size_t tag_len)
@
@ Finish computation of the authentication tag, and compare it
@ with the provided value; the caller ensures that tag_len is at
@ most 32. Returned value is 1 on exact match, 0 otherwise.
@ =======================================================================

        .align 1
        .global saturnin_aead_check_tag
        .thumb
        .thumb_func
        .type saturnin_aead_check_tag, %function
saturnin_aead_check_tag:
        push {r4, r5, r6, lr}
        movs r4, r0          @ context
        movs r5, r1          @ tag
        movs r6, r2          @ tag_len
        @ Compute tag value into a stack buffer.
        sub sp, sp, #32
        mov r1, sp
        bl saturnin_aead_get_tag
        @ Do the comparison.
        movs r2, #0
        mov r3, sp
.Lsaturnin_aead_check_tag_loop:
        subs r6, #1
        bmi .Lsaturnin_aead_check_tag_step2
        ldrb r0, [r3, r6]
        ldrb r1, [r5, r6]
        eors r0, r1
        orrs r2, r0
        b .Lsaturnin_aead_check_tag_loop
.Lsaturnin_aead_check_tag_step2:
        adds r2, #0xFF
        lsrs r2, r2, #8
        movs r0, #1
        subs r0, r2
        add sp, sp, #32
        pop {r4, r5, r6, pc}
        .size saturnin_aead_check_tag, .-saturnin_aead_check_tag
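
@ -----------------------------------------------------------------------
@ The comparison above is constant-time with regard to the compared
@ contents; in C (sketch, with computed[] the recomputed tag):
@
@     unsigned z = 0;
@     for (size_t i = 0; i < tag_len; i ++) {
@         z |= computed[i] ^ tag[i];
@     }
@     return 1 - (int)((z + 0xFF) >> 8);
@ -----------------------------------------------------------------------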

@ =======================================================================
@ Saturnin-Hash.
@
@ The context is the following structure (declared in saturnin.h):
@
@   typedef struct {
@       uint8_t state[32];
@       uint8_t buf[32];
@       size_t ptr;
@   } saturnin_hash_context;
@
@ =======================================================================

@ =======================================================================
@ void saturnin_hash_init(saturnin_hash_context *hc)
@
@ Initialize a context for a new hash computation.
@ =======================================================================

        .align 1
        .global saturnin_hash_init
        .thumb
        .thumb_func
        .type saturnin_hash_init, %function
saturnin_hash_init:
        movs r3, #0
        str r3, [r0, #64]
        movs r1, #0
        movs r2, #32
        b memset             @ tail call to memset
        .size saturnin_hash_init, .-saturnin_hash_init

@ =======================================================================
@ void hash_process_block(const uint8_t *key, const uint32_t *rc,
@                         const uint8_t *in, uint8_t *out)
@
@ Compute: in XOR E(key, in) -> out
@
@ out[] may be the same buffer as key[].
@ The in[] and out[] buffers MUST be 32-bit aligned.
@ =======================================================================

        .align 1
        .thumb
        .thumb_func
        .type hash_process_block, %function
hash_process_block:
        push {r4, r5, r6, lr}
        sub sp, sp, #64
        @ Save parameters.
        movs r4, r1
        movs r5, r2
        movs r6, r3
        @ Expand key into stack buffer.
        movs r1, r0
        mov r0, sp
        bl saturnin_key_expand
        @ Copy the input block into the output block. We do it
        @ manually because this is more efficient than calling memcpy().
        ldm r5!, {r0, r1, r2, r3}
        stm r6!, {r0, r1, r2, r3}
        ldm r5!, {r0, r1, r2, r3}
        stm r6!, {r0, r1, r2, r3}
        subs r5, #32
        subs r6, #32
        @ Apply the block encryption.
        movs r0, #16
        movs r1, r4
        mov r2, sp
        movs r3, r6
        bl saturnin_block_encrypt
        @ XOR input block into output block.
        movs r0, r6
        movs r1, r5
        bl xor32_aligned
        add sp, sp, #64
        pop {r4, r5, r6, pc}
        .size hash_process_block, .-hash_process_block

@ =======================================================================
@ void saturnin_hash_update(saturnin_hash_context *hc,
@                           const void *data, size_t data_len)
@
@ Inject more bytes in the context.
@ =======================================================================

        .align 1
        .global saturnin_hash_update
        .thumb
        .thumb_func
        .type saturnin_hash_update, %function
saturnin_hash_update:
        push {r0, r4, r5, r6, r7, lr}  @ r0 is for stack alignment
        @ Save parameters.
        movs r4, r0          @ context
        movs r5, r1          @ data
        movs r6, r2          @ data_len
        @ Read current ptr. If non-zero, then complete the current block.
        ldr r7, [r4, #64]
        tst r7, r7
        beq .Lsaturnin_hash_update_step2
        @ Copy bytes into the buffer.
        adds r0, #32
        adds r0, r7          @ destination: &hc->buf[ptr]
        movs r1, r5          @ source: data
        movs r2, #32
        subs r2, r7
        cmp r2, r6
        it hi
        movhi r2, r6         @ length: min(32-ptr, data_len)
        adds r5, r2          @ consume data (pointer)
        subs r6, r2          @ consume data (length)
        adds r7, r2          @ new ptr value
        bl memcpy
        @ If still not a full block, exit.
        cmp r7, #31
        bls .Lsaturnin_hash_update_exit
        @ Process the full block.
        movs r0, r4          @ key: current state[]
        adr r1, .RC_16_7     @ round constants
        movs r2, r4
        adds r2, #32         @ in: buf[]
        movs r3, r4          @ out: state[]
        bl hash_process_block

.Lsaturnin_hash_update_step2:
        @ Process full blocks.
.Lsaturnin_hash_update_loop:
        subs r6, #32
        bmi .Lsaturnin_hash_update_step3
        @ Copy next data block into buf[], in order to make it aligned.
        adds r4, #32
        ldr r0, [r5]
        ldr r1, [r5, #4]
        ldr r2, [r5, #8]
        ldr r3, [r5, #12]
        stm r4!, {r0, r1, r2, r3}
        ldr r0, [r5, #16]
        ldr r1, [r5, #20]
        ldr r2, [r5, #24]
        ldr r3, [r5, #28]
        stm r4!, {r0, r1, r2, r3}
        subs r4, #64
        adds r5, #32
        @ Process the block.
        movs r0, r4
        adr r1, .RC_16_7     @ round constants
        movs r2, r4
        adds r2, #32         @ in: buf[]
        movs r3, r4          @ out: state[]
        bl hash_process_block
        @ Loop for next block.
        b .Lsaturnin_hash_update_loop

.Lsaturnin_hash_update_step3:
        @ At this point, there are fewer remaining bytes than a full
        @ block; number of remaining bytes is r6+32. We simply buffer
        @ them.
        adds r6, #32
        movs r0, r4
        adds r0, #32
        movs r1, r5
        movs r2, r6
        bl memcpy
        movs r7, r6
.Lsaturnin_hash_update_exit:
        str r7, [r4, #64]    @ Store back ptr
        pop {r0, r4, r5, r6, r7, pc}
        .size saturnin_hash_update, .-saturnin_hash_update

@ =======================================================================
@ void saturnin_hash_out(const saturnin_hash_context *hc, void *out)
@
@ Complete hash computation.
@ =======================================================================

        .align 1
        .global saturnin_hash_out
        .thumb
        .thumb_func
        .type saturnin_hash_out, %function
saturnin_hash_out:
        push {r4, r5, r6, r7, lr}
        sub sp, sp, #68      @ two 32-byte stack buffers, plus room for
                             @ a copy of the ptr field (this also keeps
                             @ the stack 8-byte aligned)
        @ Save parameters.
        movs r6, r0          @ context
        movs r7, r1          @ out
        @ Copy the current contents of buf[] into the lower stack buffer.
        movs r1, r6
        adds r1, #32
        mov r0, sp
        ldm r1!, {r2, r3, r4, r5}
        stm r0!, {r2, r3, r4, r5}
        ldm r1!, {r2, r3, r4, r5}
        stm r0!, {r2, r3, r4, r5}
        @ Copy also the ptr field (pad_buffer expects it just after the
        @ block).
        ldr r2, [r1]
        str r2, [r0]
        @ Pad current buffer. This also reads the ptr field (located
        @ just after the buffer).
        mov r0, sp
        bl pad_buffer
        @ Process the padded block.
        @ Output is into a stack buffer (we need an aligned buffer, and
        @ we cannot modify the context).
        movs r0, r6          @ key: current state[]
        adr r1, .RC_16_8     @ round constants
        mov r2, sp           @ in: padded input
        mov r3, sp
        adds r3, #32         @ out: upper stack buffer
        bl hash_process_block
        @ Copy the processed block into the output. We do this before
        @ releasing the stack buffers, since an interrupt could
        @ otherwise clobber data located below the stack pointer.
        movs r0, r7
        mov r1, sp
        adds r1, #32
        movs r2, #32
        bl memcpy
        add sp, sp, #68
        pop {r4, r5, r6, r7, pc}
        .size saturnin_hash_out, .-saturnin_hash_out
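
@ -----------------------------------------------------------------------
@ Typical hashing call sequence (C sketch, assuming the declarations
@ from saturnin.h):
@
@     saturnin_hash_context hc;
@     uint8_t out[32];
@     saturnin_hash_init(&hc);
@     saturnin_hash_update(&hc, data, data_len);
@     saturnin_hash_out(&hc, out);
@ -----------------------------------------------------------------------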