/*******************************************************************************
* ARM assembly implementation of fixsliced SKINNY-128-384.
*
* For more details, see the paper at: https://
*
* @author   Alexandre Adomnicai, Nanyang Technological University,
*           alexandre.adomnicai@ntu.edu.sg
*
* @date     May 2020
*
* Syntax:   GNU as, unified syntax, Thumb encoding. One statement per line:
*           '//' and '@' comments extend to the end of the line, so an
*           instruction must never share a line with a preceding comment.
*******************************************************************************/

.syntax unified
.thumb

/*******************************************************************************
* applies P^2 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
*******************************************************************************/
.align 2
p2:
    movw    r1, #0xcc00
    movt    r1, #0xcc00                 //r1 <- 0xcc00cc00
    movw    r10, #0xcc00
    movt    r10, #0x0033                //r10<- 0x0033cc00
    // permute r6 twice
    and     r11, r1, r6, ror #14
    bfi     r11, r6, #16, #8
    and     r12, r6, #0xcc000000
    orr     r11, r11, r12, lsr #2
    and     r12, r10, r6
    orr     r11, r11, r12, lsr #8
    and     r12, r6, #0x00cc0000
    orr     r6, r11, r12, lsr #18
    // permute r7 twice
    and     r11, r1, r7, ror #14
    bfi     r11, r7, #16, #8
    and     r12, r7, #0xcc000000
    orr     r11, r11, r12, lsr #2
    and     r12, r10, r7
    orr     r11, r11, r12, lsr #8
    and     r12, r7, #0x00cc0000
    orr     r7, r11, r12, lsr #18
    // permute r8 twice
    and     r11, r1, r8, ror #14
    bfi     r11, r8, #16, #8
    and     r12, r8, #0xcc000000
    orr     r11, r11, r12, lsr #2
    and     r12, r10, r8
    orr     r11, r11, r12, lsr #8
    and     r12, r8, #0x00cc0000
    orr     r8, r11, r12, lsr #18
    // permute r9 twice
    and     r11, r1, r9, ror #14
    bfi     r11, r9, #16, #8
    and     r12, r9, #0xcc000000
    orr     r11, r11, r12, lsr #2
    and     r12, r10, r9
    orr     r11, r11, r12, lsr #8
    and     r12, r9, #0x00cc0000
    orr     r9, r11, r12, lsr #18
    bx      lr

/*******************************************************************************
* applies P^4 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
* Stack:    r14 is needed as a mask register, so it is spilled to [sp] on entry
*           and reloaded before returning. The caller must therefore own a free
*           word at [sp] (tkschedule_perm reserves it with 'sub.w sp, #4').
*******************************************************************************/
.align 2
p4:
    str.w   r14, [sp]                   //store r14 on the stack
    movw    r14, #0x00cc
    movt    r14, #0xcc00                //r14<- 0xcc0000cc
    movw    r12, #0xcc00
    movt    r12, #0x3300                //r12<- 0x3300cc00
    movw    r11, #0x00cc
    movt    r11, #0x00cc                //r11<- 0x00cc00cc
    // permute r6 4 times
    and     r10, r14, r6, ror #22
    and     r1, r12, r6, ror #16
    orr     r10, r10, r1
    and     r1, r6, r11
    orr     r10, r10, r1, lsr #2
    movw    r1, #0xcc33                 //r1 <- 0x0000cc33
    and     r6, r6, r1
    orr     r6, r10, r6, ror #24
    // permute r7 4 times
    and     r10, r14, r7, ror #22
    and     r1, r12, r7, ror #16
    orr     r10, r10, r1
    and     r1, r7, r11
    orr     r10, r10, r1, lsr #2
    movw    r1, #0xcc33                 //r1 <- 0x0000cc33
    and     r7, r7, r1
    orr     r7, r10, r7, ror #24
    // permute r8 4 times
    and     r10, r14, r8, ror #22
    and     r1, r12, r8, ror #16
    orr     r10, r10, r1
    and     r1, r8, r11
    orr     r10, r10, r1, lsr #2
    movw    r1, #0xcc33                 //r1 <- 0x0000cc33
    and     r8, r8, r1
    orr     r8, r10, r8, ror #24
    // permute r9 4 times; the r14 mask is used one last time and then r14 is
    // reloaded, and r12 (no longer needed as a mask) serves as the temporary
    and     r10, r14, r9, ror #22
    ldr.w   r14, [sp]                   //restore r14
    and     r12, r12, r9, ror #16
    orr     r10, r10, r12
    and     r12, r9, r11
    orr     r10, r10, r12, lsr #2
    movw    r12, #0xcc33                //r12<- 0x0000cc33
    and     r9, r9, r12
    orr     r9, r10, r9, ror #24
    bx      lr

/*******************************************************************************
* applies P^6 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
*******************************************************************************/
.align 2
p6:
    movw    r1, #0x3333                 //r1 <- 0x00003333
    movw    r12, #0x00cc
    movt    r12, #0x3300                //r12<- 0x330000cc
    // permute r6 6 times
    and     r10, r6, r1, ror #8
    and     r11, r12, r6, ror #24
    orr     r11, r11, r10, ror #6
    and     r10, r1, r6, ror #10
    orr     r11, r11, r10
    and     r10, r6, #0x000000cc
    orr     r11, r11, r10, lsl #14
    and     r10, r6, #0x00003300
    orr     r6, r11, r10, lsl #2
    // permute r7 6 times
    and     r10, r7, r1, ror #8
    and     r11, r12, r7, ror #24
    orr     r11, r11, r10, ror #6
    and     r10, r1, r7, ror #10
    orr     r11, r11, r10
    and     r10, r7, #0x000000cc
    orr     r11, r11, r10, lsl #14
    and     r10, r7, #0x00003300
    orr     r7, r11, r10, lsl #2
    // permute r8 6 times
    and     r10, r8, r1, ror #8
    and     r11, r12, r8, ror #24
    orr     r11, r11, r10, ror #6
    and     r10, r1, r8, ror #10
    orr     r11, r11, r10
    and     r10, r8, #0x000000cc
    orr     r11, r11, r10, lsl #14
    and     r10, r8, #0x00003300
    orr     r8, r11, r10, lsl #2
    // permute r9 6 times
    and     r10, r9, r1, ror #8
    and     r11, r12, r9, ror #24
    orr     r11, r11, r10, ror #6
    and     r10, r1, r9, ror #10
    orr     r11, r11, r10
    and     r10, r9, #0x000000cc
    orr     r11, r11, r10, lsl #14
    and     r10, r9, #0x00003300
    orr     r9, r11, r10, lsl #2
    bx      lr

/*******************************************************************************
* applies P^8 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
*******************************************************************************/
.align 2
p8:
    movw    r12, #0x3333                //r12<- 0x00003333
    movw    r1, #0x0000
    movt    r1, #0x33cc                 //r1 <- 0x33cc0000
    // permute r6 8 times
    and     r10, r6, r1
    and     r11, r1, r6, ror #8
    orr     r11, r11, r10, ror #24
    and     r10, r6, r12, lsl #2
    orr     r11, r11, r10, ror #26
    and     r10, r6, r12, lsl #8
    orr     r6, r11, r10, lsr #6
    // permute r7 8 times
    and     r10, r7, r1
    and     r11, r1, r7, ror #8
    orr     r11, r11, r10, ror #24
    and     r10, r7, r12, lsl #2
    orr     r11, r11, r10, ror #26
    and     r10, r7, r12, lsl #8
    orr     r7, r11, r10, lsr #6
    // permute r8 8 times
    and     r10, r8, r1
    and     r11, r1, r8, ror #8
    orr     r11, r11, r10, ror #24
    and     r10, r8, r12, lsl #2
    orr     r11, r11, r10, ror #26
    and     r10, r8, r12, lsl #8
    orr     r8, r11, r10, lsr #6
    // permute r9 8 times
    and     r10, r9, r1
    and     r11, r1, r9, ror #8
    orr     r11, r11, r10, ror #24
    and     r10, r9, r12, lsl #2
    orr     r11, r11, r10, ror #26
    and     r10, r9, r12, lsl #8
    orr     r9, r11, r10, lsr #6
    bx      lr

/*******************************************************************************
* applies P^10 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
*******************************************************************************/
.align 2
p10:
    movw    r12, #0x0033
    movt    r12, #0x3300                //r12<- 0x33000033
    movw    r1, #0xcc33                 //r1 <- 0x0000cc33
    // permute r6 10 times
    and     r10, r6, r1, ror #8
    and     r11, r12, r6, ror #26
    orr     r11, r11, r10, ror #8
    and     r10, r6, r12, ror #24
    orr     r11, r11, r10, ror #22
    and     r10, r6, #0x00330000
    orr     r11, r11, r10, lsr #14
    and     r10, r6, #0x0000cc00
    orr     r6, r11, r10, lsr #2
    // permute r7 10 times
    and     r10, r7, r1, ror #8
    and     r11, r12, r7, ror #26
    orr     r11, r11, r10, ror #8
    and     r10, r7, r12, ror #24
    orr     r11, r11, r10, ror #22
    and     r10, r7, #0x00330000
    orr     r11, r11, r10, lsr #14
    and     r10, r7, #0x0000cc00
    orr     r7, r11, r10, lsr #2
    // permute r8 10 times
    and     r10, r8, r1, ror #8
    and     r11, r12, r8, ror #26
    orr     r11, r11, r10, ror #8
    and     r10, r8, r12, ror #24
    orr     r11, r11, r10, ror #22
    and     r10, r8, #0x00330000
    orr     r11, r11, r10, lsr #14
    and     r10, r8, #0x0000cc00
    orr     r8, r11, r10, lsr #2
    // permute r9 10 times
    and     r10, r9, r1, ror #8
    and     r11, r12, r9, ror #26
    orr     r11, r11, r10, ror #8
    and     r10, r9, r12, ror #24
    orr     r11, r11, r10, ror #22
    and     r10, r9, #0x00330000
    orr     r11, r11, r10, lsr #14
    and     r10, r9, #0x0000cc00
    orr     r9, r11, r10, lsr #2
    bx      lr

/*******************************************************************************
* applies P^12 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
* Stack:    r14 is needed as a mask register, so it is spilled to [sp] on entry
*           and reloaded before returning. The caller must therefore own a free
*           word at [sp] (tkschedule_perm reserves it with 'sub.w sp, #4').
*******************************************************************************/
.align 2
p12:
    str.w   r14, [sp]                   //store r14 on the stack
    movw    r14, #0xcc33                //r14<- 0x0000cc33
    movw    r12, #0x00cc
    movt    r12, #0x00cc                //r12<- 0x00cc00cc
    movw    r1, #0x3300
    movt    r1, #0xcc00                 //r1 <- 0xcc003300
    // permute r6 12 times
    and     r10, r14, r6, ror #8
    and     r11, r12, r6, ror #30
    orr     r11, r11, r10
    and     r10, r1, r6, ror #16
    orr     r11, r11, r10
    movw    r10, #0xcccc                //r10<- 0x0000cccc
    and     r10, r6, r10, ror #8
    orr     r6, r11, r10, ror #10
    // permute r7 12 times
    and     r10, r14, r7, ror #8
    and     r11, r12, r7, ror #30
    orr     r11, r11, r10
    and     r10, r1, r7, ror #16
    orr     r11, r11, r10
    movw    r10, #0xcccc                //r10<- 0x0000cccc
    and     r10, r7, r10, ror #8
    orr     r7, r11, r10, ror #10
    // permute r8 12 times
    and     r10, r14, r8, ror #8
    and     r11, r12, r8, ror #30
    orr     r11, r11, r10
    and     r10, r1, r8, ror #16
    orr     r11, r11, r10
    movw    r10, #0xcccc                //r10<- 0x0000cccc
    and     r10, r8, r10, ror #8
    orr     r8, r11, r10, ror #10
    // permute r9 12 times; r14 is reloaded once its mask has been consumed
    and     r10, r14, r9, ror #8
    and     r11, r12, r9, ror #30
    orr     r11, r11, r10
    and     r10, r1, r9, ror #16
    ldr.w   r14, [sp]                   //restore r14
    orr     r11, r11, r10
    movw    r10, #0xcccc                //r10<- 0x0000cccc
    and     r10, r9, r10, ror #8
    orr     r9, r11, r10, ror #10
    bx      lr

/*******************************************************************************
* applies P^14 on the tweakey state in a bitsliced manner
*
* In/Out:   r6-r9 (four tweakey words, each rewritten in place)
* Clobbers: r1, r10, r11, r12
*******************************************************************************/
.align 2
p14:
    movw    r1, #0xcc00
    movt    r1, #0x0033                 //r1 <- 0x0033cc00
    movw    r12, #0xcc00
    movt    r12, #0xcc00                //r12<- 0xcc00cc00
    // permute r6 14 times
    and     r10, r1, r6, ror #24
    and     r11, r6, #0x00000033
    orr     r11, r10, r11, ror #14
    and     r10, r6, #0x33000000
    orr     r11, r11, r10, ror #30
    and     r10, r6, #0x00ff0000
    orr     r11, r11, r10, ror #16
    and     r10, r6, r12
    orr     r6, r11, r10, ror #18
    // permute r7 14 times
    and     r10, r1, r7, ror #24
    and     r11, r7, #0x00000033
    orr     r11, r10, r11, ror #14
    and     r10, r7, #0x33000000
    orr     r11, r11, r10, ror #30
    and     r10, r7, #0x00ff0000
    orr     r11, r11, r10, ror #16
    and     r10, r7, r12
    orr     r7, r11, r10, ror #18
    // permute r8 14 times
    and     r10, r1, r8, ror #24
    and     r11, r8, #0x00000033
    orr     r11, r10, r11, ror #14
    and     r10, r8, #0x33000000
    orr     r11, r11, r10, ror #30
    and     r10, r8, #0x00ff0000
    orr     r11, r11, r10, ror #16
    and     r10, r8, r12
    orr     r8, r11, r10, ror #18
    // permute r9 14 times
    and     r10, r1, r9, ror #24
    and     r11, r9, #0x00000033
    orr     r11, r10, r11, ror #14
    and     r10, r9, #0x33000000
    orr     r11, r11, r10, ror #30
    and     r10, r9, #0x00ff0000
    orr     r11, r11, r10, ror #16
    and     r10, r9, r12
    orr     r9, r11, r10, ror #18
    bx      lr

/*******************************************************************************
* Rearranges the bits of r2-r5 with a sequence of SWAPMOVE steps (used by
* tkschedule_lfsr to pack tk2 and tk3).
*
* SWAPMOVE(a, b, mask, n):  tmp = (b ^ (a >> n)) & mask
*                           b  ^= tmp
*                           a  ^= tmp << n
*
* In/Out:   r2-r5
* In:       r10 = 0x0a0a0a0a, r11 = 0x30303030 (both left unchanged)
* Clobbers: r12
*******************************************************************************/
.align 2
packing:
    eor     r12, r2, r2, lsr #3
    and     r12, r12, r10
    eor     r2, r2, r12
    eor     r2, r2, r12, lsl #3         //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
    eor     r12, r3, r3, lsr #3
    and     r12, r12, r10
    eor     r3, r3, r12
    eor     r3, r3, r12, lsl #3         //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
    eor     r12, r4, r4, lsr #3
    and     r12, r12, r10
    eor     r4, r4, r12
    eor     r4, r4, r12, lsl #3         //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
    eor     r12, r5, r5, lsr #3
    and     r12, r12, r10
    eor     r5, r5, r12
    eor     r5, r5, r12, lsl #3         //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
    eor     r12, r2, r4, lsr #2
    and     r12, r12, r11
    eor     r2, r2, r12
    eor     r4, r4, r12, lsl #2         //SWAPMOVE(r4, r2, 0x30303030, 2)
    eor     r12, r2, r3, lsr #4
    and     r12, r12, r11, lsr #2
    eor     r2, r2, r12
    eor     r3, r3, r12, lsl #4         //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
    eor     r12, r2, r5, lsr #6
    and     r12, r12, r11, lsr #4
    eor     r2, r2, r12
    eor     r5, r5, r12, lsl #6         //SWAPMOVE(r5, r2, 0x03030303, 6)
    eor     r12, r4, r3, lsr #2
    and     r12, r12, r11, lsr #2
    eor     r4, r4, r12
    eor     r3, r3, r12, lsl #2         //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
    eor     r12, r4, r5, lsr #4
    and     r12, r12, r11, lsr #4
    eor     r4, r4, r12
    eor     r5, r5, r12, lsl #4         //SWAPMOVE(r5, r4, 0x03030303, 4)
    eor     r12, r3, r5, lsr #2
    and     r12, r12, r11, lsr #4
    eor     r3, r3, r12
    eor     r5, r5, r12, lsl #2         //SWAPMOVE(r5, r3, 0x03030303, 2)
    bx      lr

/*******************************************************************************
* Applies the SWAPMOVE steps of 'packing' in the reverse order on r2-r5.
*
* In/Out:   r2-r5
* In:       r7 is the mask register; the SWAPMOVE annotations below imply it
*           must hold 0x30303030 on entry (no caller is visible in this part
*           of the file -- confirm at the call site).
* Clobbers: r6 (overwritten with 0x0a0a0a0a), r10
*******************************************************************************/
.align 2
unpacking:
    movw    r6, #0x0a0a
    movt    r6, #0x0a0a                 //r6 <- 0x0a0a0a0a
    eor     r10, r3, r5, lsr #2
    and     r10, r10, r7, lsr #4
    eor     r3, r3, r10
    eor     r5, r5, r10, lsl #2         //SWAPMOVE(r5, r3, 0x03030303, 2)
    eor     r10, r4, r5, lsr #4
    and     r10, r10, r7, lsr #4
    eor     r4, r4, r10
    eor     r5, r5, r10, lsl #4         //SWAPMOVE(r5, r4, 0x03030303, 4)
    eor     r10, r4, r3, lsr #2
    and     r10, r10, r7, lsr #2
    eor     r4, r4, r10
    eor     r3, r3, r10, lsl #2         //SWAPMOVE(r3, r4, 0x0c0c0c0c, 2)
    eor     r10, r2, r5, lsr #6
    and     r10, r10, r7, lsr #4
    eor     r2, r2, r10
    eor     r5, r5, r10, lsl #6         //SWAPMOVE(r5, r2, 0x03030303, 6)
    eor     r10, r2, r3, lsr #4
    and     r10, r10, r7, lsr #2
    eor     r2, r2, r10
    eor     r3, r3, r10, lsl #4         //SWAPMOVE(r3, r2, 0x0c0c0c0c, 4)
    eor     r10, r2, r4, lsr #2
    and     r10, r10, r7
    eor     r2, r2, r10
    eor     r4, r4, r10, lsl #2         //SWAPMOVE(r4, r2, 0x30303030, 2)
    eor     r10, r5, r5, lsr #3
    and     r10, r10, r6
    eor     r5, r5, r10
    eor     r5, r5, r10, lsl #3         //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3)
    eor     r10, r4, r4, lsr #3
    and     r10, r10, r6
    eor     r4, r4, r10
    eor     r4, r4, r10, lsl #3         //SWAPMOVE(r4, r4, 0x0a0a0a0a, 3)
    eor     r10, r3, r3, lsr #3
    and     r10, r10, r6
    eor     r3, r3, r10
    eor     r3, r3, r10, lsl #3         //SWAPMOVE(r3, r3, 0x0a0a0a0a, 3)
    eor     r10, r2, r2, lsr #3
    and     r10, r10, r6
    eor     r2, r2, r10
    eor     r2, r2, r10, lsl #3         //SWAPMOVE(r2, r2, 0x0a0a0a0a, 3)
    bx      lr

/******************************************************************************
* Compute LFSR2(TK2) ^ LFSR3(TK3) for all rounds.
* Performing both at the same time allows to save some memory accesses.
*
* In:       r0 = tk (output buffer), r1 = tk2, r2 = tk3, r3 = rounds
* Out:      none; r0-r12 and r14 are pushed on entry and popped on exit, so
*           every register is returned to the caller unchanged.
*
* rounds is reloaded from its pushed copy at [sp, #12] and decremented by 8 on
* each loop iteration, so it must be a positive multiple of 8.
*
* Output layout: 16 bytes are stored at tk before the loop; each of the four
* steps of a loop iteration then stores 16 bytes and advances r0 by 32, i.e.
* it leaves the following 16 bytes untouched.
******************************************************************************/
@ void tkschedule_lfsr(u32* tk, const u8* tk2, const u8* tk3, const int rounds)
.global tkschedule_lfsr
.type   tkschedule_lfsr,%function
.align 2
tkschedule_lfsr:
    push    {r0-r12, r14}
    ldr.w   r3, [r1, #8]                //load tk2 (3rd word)
    ldr.w   r4, [r1, #4]                //load tk2 (2nd word)
    ldr.w   r5, [r1, #12]               //load tk2 (4th word)
    ldr.w   r12, [r1]                   //load tk2 (1st word)
    mov     r1, r2                      //move tk3 address in r1
    mov     r2, r12                     //move 1st tk2 word in r2
    movw    r10, #0x0a0a
    movt    r10, #0x0a0a                //r10<- 0x0a0a0a0a
    movw    r11, #0x3030
    movt    r11, #0x3030                //r11<- 0x30303030
    bl      packing                     //pack tk2
    mov     r6, r2                      //move tk2 from r2-r5 to r6-r9
    mov     r7, r3                      //move tk2 from r2-r5 to r6-r9
    mov     r8, r4                      //move tk2 from r2-r5 to r6-r9
    mov     r9, r5                      //move tk2 from r2-r5 to r6-r9
    ldr.w   r3, [r1, #8]                //load tk3 (3rd word)
    ldr.w   r4, [r1, #4]                //load tk3 (2nd word)
    ldr.w   r5, [r1, #12]               //load tk3 (4th word)
    ldr.w   r2, [r1]                    //load tk3 (1st word)
    bl      packing                     //pack tk3
    eor     r10, r10, r10, lsl #4       //r10<- 0xaaaaaaaa
    ldr.w   r1, [sp, #12]               //load loop counter in r1 (pushed r3)
    eor     r11, r2, r6                 //tk2 ^ tk3 (1st word)
    eor     r12, r3, r7                 //tk2 ^ tk3 (2nd word)
    strd    r11, r12, [r0], #8          //store in tk
    eor     r11, r4, r8                 //tk2 ^ tk3 (3rd word)
    eor     r12, r5, r9                 //tk2 ^ tk3 (4th word)
    strd    r11, r12, [r0], #8          //store in tk
loop:
    // step 1: tk2 update written to r6, tk3 update written to r5
    and     r12, r8, r10                // --- apply LFSR2 to tk2
    eor     r12, r12, r6
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r6, r14, r12, lsr #1        // apply LFSR2 to tk2 ---
    and     r12, r3, r10                // --- apply LFSR3 to tk3
    eor     r12, r5, r12, lsr #1
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r5, r14, r12, lsr #1        // apply LFSR3 to tk3 ---
    eor     r11, r5, r7                 //tk2 ^ tk3 (1st word)
    eor     r12, r2, r8                 //tk2 ^ tk3 (2nd word)
    strd    r11, r12, [r0], #8          //store in tk
    eor     r11, r3, r9                 //tk2 ^ tk3 (3rd word)
    eor     r12, r4, r6                 //tk2 ^ tk3 (4th word)
    strd    r11, r12, [r0], #24         //store in tk
    // step 2: tk2 update written to r7, tk3 update written to r4
    and     r12, r9, r10                // --- apply LFSR2 to tk2
    eor     r12, r12, r7
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r7, r14, r12, lsr #1        // apply LFSR2 to tk2 ---
    and     r12, r2, r10                // --- apply LFSR3 to tk3
    eor     r12, r4, r12, lsr #1
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r4, r14, r12, lsr #1        // apply LFSR3 to tk3 ---
    eor     r11, r4, r8                 //tk2 ^ tk3 (1st word)
    eor     r12, r5, r9                 //tk2 ^ tk3 (2nd word)
    strd    r11, r12, [r0], #8          //store in tk
    eor     r11, r2, r6                 //tk2 ^ tk3 (3rd word)
    eor     r12, r3, r7                 //tk2 ^ tk3 (4th word)
    strd    r11, r12, [r0], #24         //store in tk
    // step 3: tk2 update written to r8, tk3 update written to r3
    and     r12, r6, r10                // --- apply LFSR2 to tk2
    eor     r12, r12, r8
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r8, r14, r12, lsr #1        // apply LFSR2 to tk2 ---
    and     r12, r5, r10                // --- apply LFSR3 to tk3
    eor     r12, r3, r12, lsr #1
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r3, r14, r12, lsr #1        // apply LFSR3 to tk3 ---
    eor     r11, r3, r9                 //tk2 ^ tk3 (1st word)
    eor     r12, r4, r6                 //tk2 ^ tk3 (2nd word)
    strd    r11, r12, [r0], #8          //store in tk
    eor     r11, r5, r7                 //tk2 ^ tk3 (3rd word)
    eor     r12, r2, r8                 //tk2 ^ tk3 (4th word)
    strd    r11, r12, [r0], #24         //store in tk
    // step 4: tk2 update written to r9, tk3 update written to r2
    and     r12, r7, r10                // --- apply LFSR2 to tk2
    eor     r12, r12, r9
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r9, r14, r12, lsr #1        // apply LFSR2 to tk2 ---
    and     r12, r4, r10                // --- apply LFSR3 to tk3
    eor     r12, r2, r12, lsr #1
    and     r14, r10, r12, lsl #1
    and     r12, r12, r10
    orr     r2, r14, r12, lsr #1        // apply LFSR3 to tk3 ---
    eor     r11, r2, r6                 //tk2 ^ tk3 (1st word)
    eor     r12, r3, r7                 //tk2 ^ tk3 (2nd word)
    strd    r11, r12, [r0], #8          //store in tk
    eor     r11, r4, r8                 //tk2 ^ tk3 (3rd word)
    eor     r12, r5, r9                 //tk2 ^ tk3 (4th word)
    strd    r11, r12, [r0], #24         //store in tk
    subs.w  r1, r1, #8                  //decrease loop counter by 8
    // Signed "> 0" test: identical to 'bne' when rounds is a positive multiple
    // of 8, but the loop also stops for any other count instead of stepping
    // past zero and running on over memory.
    bgt     loop
    pop     {r0-r12, r14}
    bx      lr

/****************************************************************************** * Applies the permutation P and add the round constants to all round tweakeys. ******************************************************************************/ @ void tkschedule_perm(u32* tk) .global tkschedule_perm .type tkschedule_perm,%function .align 2 tkschedule_perm: push {r0-r12, lr} sub.w sp, #4 //to store r14 in subroutines ldm r0, {r6-r9} //load tk movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r6, r6, r10 //tk &= 0xf0f0f0f0 (1st word) and r7, r7, r10 //tk &= 0xf0f0f0f0 (2nd word) and r8, r8, r10 //tk &= 0xf0f0f0f0 (3rd word) and r9, r9, r10 //tk &= 0xf0f0f0f0 (4th word) eor r8, r8, #0x00000004 //add rconst eor r9, r9, #0x00000040 //add rconst mvn r9, r9 //to remove a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 1st round strd r6, r7, [r0], #8 //store 2nd half tk for 1st round ldm r0, {r6-r9} //load tk bl p2 //apply the permutation twice movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #26 //ror and mask to match fixslicing and r12, r10, r7, ror #26 //ror and mask to match fixslicing strd r11, r12, [r0], #8 //store 1st half tk for 2nd round and r11, r10, r8, ror #26 //ror and mask to match fixslicing and r12, r10, r9, ror #26 //ror and mask to match fixslicing eor r11, r11, #0x10000000 //add rconst eor r11, r11, #0x00000100 //add rconst eor r12, r12, #0x00000100 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 2nd round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #12 and r11, r10, r7, ror #28 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #12 and r11, r10, r8, ror #28 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #12 and r11, r10, r9, ror #28 and r9, r9, r10, lsl #6 orr r9,
r11, r9, ror #12 //ror and masks to match fixslicing --- eor r7, r7, #0x04000000 //add rconst eor r8, r8, #0x44000000 //add rconst eor r9, r9, #0x04000000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 3rd round strd r6, r7, [r0], #8 //store 2nd half tk for 3rd round ldm r0, {r6-r9} //load tk bl p4 //apply the permutation 4 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6, ror #16 //ror and mask to match fixslicing and r12, r10, r7, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00400000 //add rconst eor r12, r12, #0x00400000 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 5th round and r11, r10, r8, ror #16 //ror and mask to match fixslicing and r12, r10, r9, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00440000 //add rconst eor r12, r12, #0x00500000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 5th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #6 and r11, r10, r7, ror #14 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #6 and r11, r10, r8, ror #14 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #6 and r11, r10, r9, ror #14 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #6 //ror and masks to match fixslicing --- eor r6, r6, #0x00100000 //add rconst eor r7, r7, #0x00100000 //add rconst eor r8, r8, #0x00100000 //add rconst eor r8, r8, #0x00000001 //add rconst eor r9, r9, #0x00100000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 4th round strd r8, r9, [r0], #24 //store 2nd half tk for 4th round ldm r0, {r6-r9} //load tk bl p6 //apply the permutation 6 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #10 //ror and mask to match fixslicing and r12, r10, r7, ror #10 //ror and 
mask to match fixslicing eor r11, r11, #0x01000000 //add rconst eor r12, r12, #0x01000000 //add rconst strd r11, r12, [r0], #8 //store 1st half tk for 6th round and r11, r10, r8, ror #10 //ror and mask to match fixslicing and r12, r10, r9, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x01400000 //add rconst eor r11, r11, #0x00001000 //add rconst eor r12, r12, #0x00400000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 6th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #28 and r11, r10, r7, ror #12 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #28 and r11, r10, r8, ror #12 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #28 and r11, r10, r9, ror #12 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #28 //ror and masks to match fixslicing --- eor r6, r6, #0x00000400 //add rconst eor r7, r7, #0x00000400 //add rconst eor r8, r8, #0x01000000 //add rconst eor r8, r8, #0x00004000 //add rconst eor r9, r9, #0x01000000 //add rconst eor r9, r9, #0x00000400 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 7th round strd r6, r7, [r0], #8 //store 2nd half tk for 7th round ldm r0, {r6-r9} //load tk bl p8 //apply the permutation 8 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6 //ror and mask to match fixslicing and r12, r10, r7 //ror and mask to match fixslicing eor r12, r12, #0x00000040 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 9th round and r11, r10, r8 //ror and mask to match fixslicing and r12, r10, r9 //ror and mask to match fixslicing eor r11, r11, #0x00000054 //add rconst eor r12, r12, #0x00000050 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 9th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #30 
//--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #22 and r11, r10, r7, ror #30 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #22 and r11, r10, r8, ror #30 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #22 and r11, r10, r9, ror #30 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #22 //ror and masks to match fixslicing --- eor r6 ,r6, #0x00000010 eor r8, r8, #0x00010000 eor r8, r8, #0x00000410 eor r9, r9, #0x00000410 mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 8th round strd r8, r9, [r0], #24 //store 2nd half tk for 8th round ldm r0, {r6-r9} //load tk bl p10 //apply the permutation 10 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #26 //ror and mask to match fixslicing and r12, r10, r7, ror #26 //ror and mask to match fixslicing eor r11, r11, #0x00000100 //add rconst eor r12, r12, #0x00000100 //add rconst strd r11, r12, [r0], #8 //store 1st half tk for 10th round and r11, r10, r8, ror #26 //ror and mask to match fixslicing and r12, r10, r9, ror #26 //ror and mask to match fixslicing eor r11, r11, #0x10000000 //add rconst eor r11, r11, #0x00000140 //add rconst eor r12, r12, #0x00000100 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 10th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #12 and r11, r10, r7, ror #28 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #12 and r11, r10, r8, ror #28 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #12 and r11, r10, r9, ror #28 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #12 //ror and masks to match fixslicing --- eor r6, r6, #0x04000000 //add rconst eor r7, r7, #0x04000000 //add rconst eor r8, r8, #0x44000000 //add rconst eor r9, r9, #0x00000100 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half 
tk for 11th round strd r6, r7, [r0], #8 //store 2nd half tk for 11th round ldm r0, {r6-r9} //load tk bl p12 //apply the permutation 4 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6, ror #16 //ror and mask to match fixslicing and r12, r10, r7, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00400000 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 13th round and r11, r10, r8, ror #16 //ror and mask to match fixslicing and r12, r10, r9, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00140000 //add rconst eor r12, r12, #0x00500000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 13th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #6 and r11, r10, r7, ror #14 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #6 and r11, r10, r8, ror #14 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #6 and r11, r10, r9, ror #14 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #6 //ror and masks to match fixslicing --- eor r6, r6, #0x00100000 //add rconst eor r7, r7, #0x00100000 //add rconst eor r8, r8, #0x04000000 //add rconst eor r8, r8, #0x00000001 //add rconst eor r9, r9, #0x04000000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 12th round strd r8, r9, [r0], #24 //store 2nd half tk for 12th round ldm r0, {r6-r9} //load tk bl p14 //apply the permutation 6 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #10 //ror and mask to match fixslicing and r12, r10, r7, ror #10 //ror and mask to match fixslicing strd r11, r12, [r0], #8 //store 1st half tk for 14th round and r11, r10, r8, ror #10 //ror and mask to match fixslicing and r12, r10, r9, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x01400000 //add rconst eor r11, r11, #0x00001000 //add rconst eor 
r12, r12, #0x01400000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 14th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #28 and r11, r10, r7, ror #12 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #28 and r11, r10, r8, ror #12 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #28 and r11, r10, r9, ror #12 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #28 //ror and masks to match fixslicing --- eor r7, r7, #0x00000400 //add rconst eor r8, r8, #0x01000000 //add rconst eor r8, r8, #0x00004400 //add rconst eor r9, r9, #0x00000400 //add const mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 15th round strd r6, r7, [r0], #8 //store 2nd half tk for 15th round ldm r0, {r6-r9} //load tk movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6 //ror and mask to match fixslicing and r12, r10, r7 //ror and mask to match fixslicing eor r11, r11, #0x00000040 //add rconst eor r12, r12, #0x00000040 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 17th round and r11, r10, r8 //ror and mask to match fixslicing and r12, r10, r9 //ror and mask to match fixslicing eor r11, r11, #0x00000004 //add rconst eor r12, r12, #0x00000050 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 17th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #22 and r11, r10, r7, ror #30 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #22 and r11, r10, r8, ror #30 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #22 and r11, r10, r9, ror #30 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #22 //ror and masks to match fixslicing --- eor r6 ,r6, #0x00000010 eor r7 ,r7, #0x00000010 eor r8, r8, #0x00000010 eor r8, 
r8, #0x00010000 mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 16th round strd r8, r9, [r0], #24 //store 2nd half tk for 16th round ldm r0, {r6-r9} //load tk bl p2 //apply the permutation twice movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #26 //ror and mask to match fixslicing and r12, r10, r7, ror #26 //ror and mask to match fixslicing eor r11, r11, #0x00000100 //add rconst strd r11, r12, [r0], #8 //store 1st half tk for 18th round and r11, r10, r8, ror #26 //ror and mask to match fixslicing and r12, r10, r9, ror #26 //ror and mask to match fixslicing eor r11, r11, #0x10000000 //add rconst eor r11, r11, #0x00000140 //add rconst eor r12, r12, #0x00000040 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 18th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #12 and r11, r10, r7, ror #28 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #12 and r11, r10, r8, ror #28 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #12 and r11, r10, r9, ror #28 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #12 //ror and masks to match fixslicing --- eor r7, r7, #0x04000000 //add rconst eor r8, r8, #0x40000000 //add rconst eor r8, r8, #0x00000100 //add rconst eor r9, r9, #0x04000000 //add rconst eor r9, r9, #0x00000100 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 19th round strd r6, r7, [r0], #8 //store 2nd half tk for 19th round ldm r0, {r6-r9} //load tk bl p4 //apply the permutation 4 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6, ror #16 //ror and mask to match fixslicing and r12, r10, r7, ror #16 //ror and mask to match fixslicing eor r12, r12, #0x00400000 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 21th round and r11, r10, r8, ror 
#16 //ror and mask to match fixslicing and r12, r10, r9, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00440000 //add rconst eor r12, r12, #0x00100000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 21th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #6 and r11, r10, r7, ror #14 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #6 and r11, r10, r8, ror #14 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #6 and r11, r10, r9, ror #14 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #6 //ror and masks to match fixslicing --- eor r6, r6, #0x00100000 //add rconst eor r8, r8, #0x04100000 //add rconst eor r8, r8, #0x00000001 //add rconst eor r9, r9, #0x00100000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 20th round strd r8, r9, [r0], #24 //store 2nd half tk for 20th round ldm r0, {r6-r9} //load tk bl p6 //apply the permutation 6 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #10 //ror and mask to match fixslicing and r12, r10, r7, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x01000000 //add rconst eor r12, r12, #0x01000000 //add rconst strd r11, r12, [r0], #8 //store 1st half tk for 22th round and r11, r10, r8, ror #10 //ror and mask to match fixslicing and r12, r10, r9, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x00400000 //add rconst eor r11, r11, #0x00001000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 22th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #28 and r11, r10, r7, ror #12 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #28 and r11, r10, r8, ror #12 and r8, r8, r10, lsl #6 orr r8, 
r11, r8, ror #28 and r11, r10, r9, ror #12 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #28 //ror and masks to match fixslicing --- eor r6, r6, #0x00000400 //add rconst eor r8, r8, #0x00004000 //add rconst eor r9, r9, #0x01000000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 23th round strd r6, r7, [r0], #8 //store 2nd half tk for 23th round ldm r0, {r6-r9} //load tk bl p8 //apply the permutation 8 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6 //ror and mask to match fixslicing and r12, r10, r7 //ror and mask to match fixslicing strd r11, r12, [r0, #24] //store 2nd half tk for 25th round and r11, r10, r8 //ror and mask to match fixslicing and r12, r10, r9 //ror and mask to match fixslicing eor r11, r11, #0x00000014 //add rconst eor r12, r12, #0x00000040 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 25th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #22 and r11, r10, r7, ror #30 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #22 and r11, r10, r8, ror #30 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #22 and r11, r10, r9, ror #30 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #22 //ror and masks to match fixslicing --- eor r8, r8, #0x00010400 eor r9, r9, #0x00000400 mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 24th round strd r8, r9, [r0], #24 //store 2nd half tk for 24th round ldm r0, {r6-r9} //load tk bl p10 //apply the permutation 10 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #26 //ror and mask to match fixslicing and r12, r10, r7, ror #26 //ror and mask to match fixslicing strd r11, r12, [r0], #8 //store 1st half tk for 26th round and r11, r10, r8, ror #26 //ror and mask to match fixslicing and r12, r10, r9, 
ror #26 //ror and mask to match fixslicing eor r11, r11, #0x10000000 //add rconst eor r11, r11, #0x00000100 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 26th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #12 and r11, r10, r7, ror #28 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #12 and r11, r10, r8, ror #28 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #12 and r11, r10, r9, ror #28 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #12 //ror and masks to match fixslicing --- eor r7, r7, #0x04000000 //add rconst eor r8, r8, #0x40000000 //add rconst eor r9, r9, #0x04000000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 27th round strd r6, r7, [r0], #8 //store 2nd half tk for 27th round ldm r0, {r6-r9} //load tk bl p12 //apply the permutation 4 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6, ror #16 //ror and mask to match fixslicing and r12, r10, r7, ror #16 //ror and mask to match fixslicing eor r12, r12, #0x00400000 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 29th round and r11, r10, r8, ror #16 //ror and mask to match fixslicing and r12, r10, r9, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00440000 //add rconst eor r12, r12, #0x00500000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 29th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #6 and r11, r10, r7, ror #14 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #6 and r11, r10, r8, ror #14 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #6 and r11, r10, r9, ror #14 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #6 //ror and masks to match fixslicing --- 
eor r6, r6, #0x00100000 //add rconst eor r8, r8, #0x00100000 //add rconst eor r8, r8, #0x00000001 //add rconst eor r9, r9, #0x00100000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 28th round strd r8, r9, [r0], #24 //store 2nd half tk for 28th round ldm r0, {r6-r9} //load tk bl p14 //apply the permutation 6 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #10 //ror and mask to match fixslicing and r12, r10, r7, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x01000000 //add rconst eor r12, r12, #0x01000000 //add rconst strd r11, r12, [r0], #8 //store 1st half tk for 30th round and r11, r10, r8, ror #10 //ror and mask to match fixslicing and r12, r10, r9, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x01400000 //add rconst eor r11, r11, #0x00001000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 30th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #28 and r11, r10, r7, ror #12 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #28 and r11, r10, r8, ror #12 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #28 and r11, r10, r9, ror #12 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #28 //ror and masks to match fixslicing --- eor r6, r6, #0x00000400 //add rconst eor r7, r7, #0x00000400 //add rconst eor r8, r8, #0x00004000 //add rconst eor r9, r9, #0x01000000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 31th round strd r6, r7, [r0], #8 //store 2nd half tk for 31th round ldm r0, {r6-r9} //load tk movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6 //ror and mask to match fixslicing and r12, r10, r7 //ror and mask to match fixslicing strd r11, r12, [r0, #24] //store 2nd half tk for 33th round and r11, r10, r8 
//ror and mask to match fixslicing and r12, r10, r9 //ror and mask to match fixslicing eor r11, r11, #0x00000014 //add rconst eor r12, r12, #0x00000050 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 33th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #22 and r11, r10, r7, ror #30 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #22 and r11, r10, r8, ror #30 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #22 and r11, r10, r9, ror #30 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #22 //ror and masks to match fixslicing --- eor r6 ,r6, #0x00000010 eor r8, r8, #0x00010400 eor r9, r9, #0x00000400 mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 32th round strd r8, r9, [r0], #24 //store 2nd half tk for 32th round ldm r0, {r6-r9} //load tk bl p2 //apply the permutation twice movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #26 //ror and mask to match fixslicing and r12, r10, r7, ror #26 //ror and mask to match fixslicing strd r11, r12, [r0], #8 //store 1st half tk for 34th round and r11, r10, r8, ror #26 //ror and mask to match fixslicing and r12, r10, r9, ror #26 //ror and mask to match fixslicing eor r11, r11, #0x10000000 //add rconst eor r11, r11, #0x00000140 //add rconst eor r12, r12, #0x00000100 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 34th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #28 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #12 and r11, r10, r7, ror #28 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #12 and r11, r10, r8, ror #28 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #12 and r11, r10, r9, ror #28 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #12 //ror and masks to match 
fixslicing --- eor r7, r7, #0x04000000 //add rconst eor r8, r8, #0x44000000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 35th round strd r6, r7, [r0], #8 //store 2nd half tk for 35th round ldm r0, {r6-r9} //load tk bl p4 //apply the permutation 4 times movw r10, #0xf0f0 movt r10, #0xf0f0 //r10<- 0xf0f0f0f0 and r11, r10, r6, ror #16 //ror and mask to match fixslicing and r12, r10, r7, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00400000 //add rconst strd r11, r12, [r0, #24] //store 2nd half tk for 37th round and r11, r10, r8, ror #16 //ror and mask to match fixslicing and r12, r10, r9, ror #16 //ror and mask to match fixslicing eor r11, r11, #0x00440000 //add rconst eor r12, r12, #0x00500000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0, #16] //store 1st half tk for 37th round and r10, r10, r10, lsr #2 //r10<- 0x30303030 and r11, r10, r6, ror #14 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #6 and r11, r10, r7, ror #14 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #6 and r11, r10, r8, ror #14 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #6 and r11, r10, r9, ror #14 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #6 //ror and masks to match fixslicing --- eor r6, r6, #0x00100000 //add rconst eor r7, r7, #0x00100000 //add rconst eor r8, r8, #0x00000001 //add rconst eor r9, r9, #0x00100000 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 36th round strd r8, r9, [r0], #24 //store 2nd half tk for 36th round ldm r0, {r6-r9} //load tk bl p6 //apply the permutation 6 times movw r10, #0xc3c3 movt r10, #0xc3c3 //r10<- 0xc3c3c3c3 and r11, r10, r6, ror #10 //ror and mask to match fixslicing and r12, r10, r7, ror #10 //ror and mask to match fixslicing eor r12, r12, #0x01000000 //add rconst strd r11, r12, [r0], #8 //store 1st half tk for 38th round and r11, r10, r8, ror #10 
//ror and mask to match fixslicing and r12, r10, r9, ror #10 //ror and mask to match fixslicing eor r11, r11, #0x01400000 //add rconst eor r11, r11, #0x00001000 //add rconst eor r12, r12, #0x00400000 //add rconst mvn r12, r12 //to save a NOT in sbox calculations strd r11, r12, [r0], #8 //store 2nd half tk for 38th round and r10, r10, r10, lsr #6 //r10<- 0x03030303 and r11, r10, r6, ror #12 //--- ror and masks to match fixslicing and r6, r6, r10, lsl #6 orr r6, r11, r6, ror #28 and r11, r10, r7, ror #12 and r7, r7, r10, lsl #6 orr r7, r11, r7, ror #28 and r11, r10, r8, ror #12 and r8, r8, r10, lsl #6 orr r8, r11, r8, ror #28 and r11, r10, r9, ror #12 and r9, r9, r10, lsl #6 orr r9, r11, r9, ror #28 //ror and masks to match fixslicing --- eor r6, r6, #0x00000400 //add rconst eor r7, r7, #0x00000400 //add rconst eor r8, r8, #0x01000000 eor r8, r8, #0x00004000 //add rconst eor r9, r9, #0x00000400 //add rconst mvn r9, r9 //to save a NOT in sbox calculations strd r8, r9, [r0], #8 //store 1st half tk for 39th round strd r6, r7, [r0], #8 //store 2nd half tk for 39th round ldm r0, {r6-r9} //load tk bl p8 //apply the permutation 8 times movw r10, #0x3030 movt r10, #0x3030 //r10<- 0x30303030 and r11, r10, r6, ror #30 //--- ror and masks to match fixslicing and r6, r6, r10, ror #4 orr r6, r11, r6, ror #22 and r11, r10, r7, ror #30 and r7, r7, r10, ror #4 orr r7, r11, r7, ror #22 and r11, r10, r8, ror #30 and r8, r8, r10, ror #4 orr r8, r11, r8, ror #22 and r11, r10, r9, ror #30 and r9, r9, r10, ror #4 orr r9, r11, r9, ror #22 //ror and masks to match fixslicing --- eor r6, r6, #0x00000010 eor r8, r8, #0x00010000 eor r8, r8, #0x00000010 eor r9, r9, #0x00000400 mvn r9, r9 //to save a NOT in sbox calculations strd r6, r7, [r0], #8 //store 1st half tk for 39th round strd r8, r9, [r0] //store 2nd half tk for 39th round add.w sp, #4 //restore stack pointer pop {r0-r12, lr} bx lr /****************************************************************************** * Applies the 
permutations P^2, ..., P^14 for rounds 0 to 16. Since P^16=Id, we
* don't need more calculations as no LFSR is applied to TK1.
*
* In:   r0 = tk  (output buffer), r1 = key (TK1 is read as four 32-bit words
*       at offsets 0, 4, 8 and 12).
* Out:  sixteen 16-byte entries (256 bytes) are written starting at tk.
*       Entries 1..15 are written in increasing order through r0; the 16th
*       entry (byte offset 240) is written early, right after the 1st one,
*       from the still unpermuted state.
* Regs: r0-r12 and lr are pushed on entry and popped on exit, so the caller
*       gets every register back unchanged.
*       r6-r9   = packed TK1 state (permuted in place by p2)
*       r2      = 0xf0f0f0f0 mask (set after packing, kept until the end)
*       r3      = rotating mask (0x30303030 / 0xc3c3c3c3 / 0x03030303)
*       r11,r12 = scratch
* Calls: packing (defined elsewhere in this file) and p2.
******************************************************************************/
@ void tkschedule_perm_tk1(u32* tk, const u8* key)
.global tkschedule_perm_tk1
.type   tkschedule_perm_tk1,%function
.align 2
tkschedule_perm_tk1:
    push    {r0-r12, lr}
    ldr.w   r3, [r1, #8]            //load tk1 (3rd word)
    ldr.w   r4, [r1, #4]            //load tk1 (2nd word)
    ldr.w   r5, [r1, #12]           //load tk1 (4th word)
    ldr.w   r2, [r1]                //load tk1 (1st word)
    movw    r10, #0x0a0a
    movt    r10, #0x0a0a            //r10<- 0x0a0a0a0a
    movw    r11, #0x3030
    movt    r11, #0x3030            //r11<- 0x30303030
    // NOTE(review): r10/r11 are presumably the masks expected by packing;
    // confirm against the packing routine.
    bl      packing                 //pack tk1
    mov     r6, r2                  //move tk1 from r2-r5 to r6-r9
    mov     r7, r3                  //move tk1 from r2-r5 to r6-r9
    mov     r8, r4                  //move tk1 from r2-r5 to r6-r9
    mov     r9, r5                  //move tk1 from r2-r5 to r6-r9
    movw    r2, #0xf0f0
    movt    r2, #0xf0f0             //r2<- 0xf0f0f0f0
    // ---- 1st round entry: plain mask, no rotation ----
    and     r11, r8, r2             //tk &= 0xf0f0f0f0 (3rd word)
    and     r12, r9, r2             //tk &= 0xf0f0f0f0 (4th word)
    strd    r11, r12, [r0], #8      //store 1st half tk for 1st round
    and     r11, r6, r2             //tk &= 0xf0f0f0f0 (1st word)
    and     r12, r7, r2             //tk &= 0xf0f0f0f0 (2nd word)
    strd    r11, r12, [r0], #8      //store 2nd half tk for 1st round
    // ---- 16th round entry, written ahead of time ----
    // r0 is now tk+16, so [r0, #224..#236] is byte offset 240..252 from tk.
    // The state is still unpermuted here, which is what P^16 = Id requires.
    movw    r3, #0x3030
    movt    r3, #0x3030             //r3 <- 0x30303030
    and     r11, r3, r6, ror #30    //--- ror and masks to match fixslicing
    and     r12, r6, r3, ror #4
    orr     r12, r11, r12, ror #22
    str.w   r12, [r0, #224]
    and     r11, r3, r7, ror #30
    and     r12, r7, r3, ror #4
    orr     r12, r11, r12, ror #22
    str.w   r12, [r0, #228]
    and     r11, r3, r8, ror #30
    and     r12, r8, r3, ror #4
    orr     r12, r11, r12, ror #22
    str.w   r12, [r0, #232]
    and     r11, r3, r9, ror #30
    and     r12, r9, r3, ror #4
    orr     r12, r11, r12, ror #22  //ror and masks to match fixslicing ---
    str.w   r12, [r0, #236]
    // ---- 2nd and 3rd round entries (state = P^2) ----
    bl      p2                      //apply P^2 (cumulative: P^2)
    movw    r3, #0xc3c3
    movt    r3, #0xc3c3             //r3 <- 0xc3c3c3c3
    and     r11, r3, r6, ror #26    //ror and mask to match fixslicing
    and     r12, r3, r7, ror #26    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 1st half tk for 2nd round
    and     r11, r3, r8, ror #26    //ror and mask to match fixslicing
    and     r12, r3, r9, ror #26    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 2nd half tk for 2nd round
    and     r3, r3, r3, lsr #6      //r3<- 0x03030303
    // 3rd round entry: words derived from r8, r9 go to offsets 0, 4 and the
    // ones derived from r6, r7 to offsets 8, 12; r0 advances on the last store.
    and     r11, r3, r6, ror #28    //--- ror and masks to match fixslicing
    and     r12, r6, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0, #8]
    and     r11, r3, r7, ror #28
    and     r12, r7, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0, #12]
    and     r11, r3, r9, ror #28
    and     r12, r9, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0, #4]
    and     r11, r3, r8, ror #28
    and     r12, r8, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0], #16          //ror and masks to match fixslicing ---
    // ---- 4th and 5th round entries (state = P^4) ----
    bl      p2                      //apply P^2 (cumulative: P^4)
    lsl     r3, r3, #4              //r3 <- 0x30303030
    and     r11, r3, r6, ror #14    //--- ror and masks to match fixslicing
    and     r12, r6, r3, ror #4
    orr     r12, r11, r12, ror #6
    str.w   r12, [r0], #4
    and     r11, r3, r7, ror #14
    and     r12, r7, r3, ror #4
    orr     r12, r11, r12, ror #6
    str.w   r12, [r0], #4
    and     r11, r3, r8, ror #14
    and     r12, r8, r3, ror #4
    orr     r12, r11, r12, ror #6
    str.w   r12, [r0], #4
    and     r11, r3, r9, ror #14
    and     r12, r9, r3, ror #4
    orr     r12, r11, r12, ror #6   //ror and masks to match fixslicing ---
    str.w   r12, [r0], #4
    and     r11, r2, r6, ror #16    //ror and mask to match fixslicing
    and     r12, r2, r7, ror #16    //ror and mask to match fixslicing
    strd    r11, r12, [r0, #8]      //store 2nd half tk for 5th round
    and     r11, r2, r8, ror #16    //ror and mask to match fixslicing
    and     r12, r2, r9, ror #16    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #16     //store 1st half tk for 5th round
    // ---- 6th and 7th round entries (state = P^6) ----
    bl      p2                      //apply P^2 (cumulative: P^6)
    movw    r3, #0xc3c3
    movt    r3, #0xc3c3             //r3<- 0xc3c3c3c3
    and     r11, r3, r6, ror #10    //ror and mask to match fixslicing
    and     r12, r3, r7, ror #10    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 1st half tk for 6th round
    and     r11, r3, r8, ror #10    //ror and mask to match fixslicing
    and     r12, r3, r9, ror #10    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 2nd half tk for 6th round
    and     r3, r3, r3, lsr #6      //r3<- 0x03030303
    and     r11, r3, r6, ror #12    //--- ror and masks to match fixslicing
    and     r12, r6, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0, #8]
    and     r11, r3, r7, ror #12
    and     r12, r7, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0, #12]
    and     r11, r3, r9, ror #12
    and     r12, r9, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0, #4]
    and     r11, r3, r8, ror #12
    and     r12, r8, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0], #16          //ror and masks to match fixslicing ---
    // ---- 8th and 9th round entries (state = P^8) ----
    bl      p2                      //apply P^2 (cumulative: P^8)
    lsl     r3, r3, #4              //r3 <- 0x30303030
    and     r11, r3, r6, ror #30    //--- ror and masks to match fixslicing
    and     r12, r6, r3, ror #4
    orr     r12, r11, r12, ror #22
    str.w   r12, [r0], #4
    and     r11, r3, r7, ror #30
    and     r12, r7, r3, ror #4
    orr     r12, r11, r12, ror #22
    str.w   r12, [r0], #4
    and     r11, r3, r8, ror #30
    and     r12, r8, r3, ror #4
    orr     r12, r11, r12, ror #22
    str.w   r12, [r0], #4
    and     r11, r3, r9, ror #30
    and     r12, r9, r3, ror #4
    orr     r12, r11, r12, ror #22  //ror and masks to match fixslicing ---
    str.w   r12, [r0], #4
    and     r11, r2, r6             //mask to match fixslicing (no rotation here)
    and     r12, r2, r7             //mask to match fixslicing (no rotation here)
    strd    r11, r12, [r0, #8]      //store 2nd half tk for 9th round
    and     r11, r2, r8             //mask to match fixslicing (no rotation here)
    and     r12, r2, r9             //mask to match fixslicing (no rotation here)
    strd    r11, r12, [r0], #16     //store 1st half tk for 9th round
    // ---- 10th and 11th round entries (state = P^10) ----
    bl      p2                      //apply P^2 (cumulative: P^10)
    movw    r3, #0xc3c3
    movt    r3, #0xc3c3             //r3 <- 0xc3c3c3c3
    and     r11, r3, r6, ror #26    //ror and mask to match fixslicing
    and     r12, r3, r7, ror #26    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 1st half tk for 10th round
    and     r11, r3, r8, ror #26    //ror and mask to match fixslicing
    and     r12, r3, r9, ror #26    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 2nd half tk for 10th round
    and     r3, r3, r3, lsr #6      //r3 <- 0x03030303
    and     r11, r3, r6, ror #28    //--- ror and masks to match fixslicing
    and     r12, r6, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0, #8]
    and     r11, r3, r7, ror #28
    and     r12, r7, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0, #12]
    and     r11, r3, r9, ror #28
    and     r12, r9, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0, #4]
    and     r11, r3, r8, ror #28
    and     r12, r8, r3, lsl #6
    orr     r12, r11, r12, ror #12
    str.w   r12, [r0], #16          //ror and masks to match fixslicing ---
    // ---- 12th and 13th round entries (state = P^12) ----
    bl      p2                      //apply P^2 (cumulative: P^12)
    lsl     r3, r3, #4              //r3 <- 0x30303030
    and     r11, r3, r6, ror #14    //--- ror and masks to match fixslicing
    and     r12, r6, r3, ror #4
    orr     r12, r11, r12, ror #6
    str.w   r12, [r0], #4
    and     r11, r3, r7, ror #14
    and     r12, r7, r3, ror #4
    orr     r12, r11, r12, ror #6
    str.w   r12, [r0], #4
    and     r11, r3, r8, ror #14
    and     r12, r8, r3, ror #4
    orr     r12, r11, r12, ror #6
    str.w   r12, [r0], #4
    and     r11, r3, r9, ror #14
    and     r12, r9, r3, ror #4
    orr     r12, r11, r12, ror #6   //ror and masks to match fixslicing ---
    str.w   r12, [r0], #4
    and     r11, r2, r6, ror #16    //ror and mask to match fixslicing
    and     r12, r2, r7, ror #16    //ror and mask to match fixslicing
    strd    r11, r12, [r0, #8]      //store 2nd half tk for 13th round
    and     r11, r2, r8, ror #16    //ror and mask to match fixslicing
    and     r12, r2, r9, ror #16    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #16     //store 1st half tk for 13th round
    // ---- 14th and 15th round entries (state = P^14) ----
    bl      p2                      //apply P^2 (cumulative: P^14)
    movw    r3, #0xc3c3
    movt    r3, #0xc3c3             //r3 <- 0xc3c3c3c3
    and     r11, r3, r6, ror #10    //ror and mask to match fixslicing
    and     r12, r3, r7, ror #10    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 1st half tk for 14th round
    and     r11, r3, r8, ror #10    //ror and mask to match fixslicing
    and     r12, r3, r9, ror #10    //ror and mask to match fixslicing
    strd    r11, r12, [r0], #8      //store 2nd half tk for 14th round
    and     r3, r3, r3, lsr #6      //r3 <- 0x03030303
    and     r11, r3, r6, ror #12    //--- ror and masks to match fixslicing
    and     r12, r6, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0, #8]
    and     r11, r3, r7, ror #12
    and     r12, r7, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0, #12]
    and     r11, r3, r9, ror #12
    and     r12, r9, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0, #4]
    and     r11, r3, r8, ror #12
    and     r12, r8, r3, lsl #6
    orr     r12, r11, r12, ror #28
    str.w   r12, [r0], #16          //ror and masks to match fixslicing ---
    pop     {r0-r12, lr}            //restore every register saved on entry
    bx      lr

/******************************************************************************
* Quadruple round of fixsliced SKINNY-128 (forward direction).
*
* Runs four consecutive rounds on the state held in r2-r5. Each round is:
* a substitution layer (orr/eor/mvn interleaved with SWAPMOVE steps), the
* addition of four words read through r1 and four words read through r0, and
* the mixcolumns variant numbered 0..3 for that round.
*
* In/Out: r2-r5 = state (updated in place)
* In:     r0    = pointer to the rtk_1 words; advanced by 16 bytes per round
*                 (64 bytes per call)
*         r1    = pointer to the rtk_2_3 + rconst words; advanced by 16 bytes
*                 per round (64 bytes per call)
*         r6    = SWAPMOVE mask (0x55555555 according to the inline comments;
*                 loaded by the caller)
*         r7    = mixcolumns mask, loaded by the caller (value not visible in
*                 this routine)
* Scratch: r8-r11. Leaf routine: returns with bx lr, uses no stack.
******************************************************************************/
.align 2
quadruple_round:
    // ===== round 1 of 4: substitution layer =====
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    eor     r8, r2, r5
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r5, r5, r8              //SWAPMOVE(r5, r2, 0x55555555, 0);
    // round 1: key addition (r1 and r0 each advance by 16 bytes)
    ldmia.w r1!, {r8-r11}           //load rkeys in r8,...,r11
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldmia.w r0!,{r8-r11}            //load rtk_1 in r8,...,r11
    eor     r2, r2, r8              //add rtk_1
    eor     r3, r3, r9              //add rtk_1
    eor     r4, r4, r10             //add rtk_1
    eor     r5, r5, r11             //add rtk_1
    and     r8, r7, r2, ror #30     // --- mixcolumns 0 ---
    eor     r2, r2, r8, ror #24
    and     r8, r7, r2, ror #18
    eor     r2, r2, r8, ror #2
    and     r8, r7, r2, ror #6
    eor     r2, r2, r8, ror #4
    and     r8, r7, r3, ror #30
    eor     r3, r3, r8, ror #24
    and     r8, r7, r3, ror #18
    eor     r3, r3, r8, ror #2
    and     r8, r7, r3, ror #6
    eor     r3, r3, r8, ror #4
    and     r8, r7, r4, ror #30
    eor     r4, r4, r8, ror #24
    and     r8, r7, r4, ror #18
    eor     r4, r4, r8, ror #2
    and     r8, r7, r4, ror #6
    eor     r4, r4, r8, ror #4
    and     r8, r7, r5, ror #30
    eor     r5, r5, r8, ror #24
    and     r8, r7, r5, ror #18
    eor     r5, r5, r8, ror #2
    and     r8, r7, r5, ror #6
    eor     r5, r5, r8, ror #4
    // ===== round 2 of 4: substitution layer =====
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    eor     r8, r3, r4
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8              //SWAPMOVE(r4, r3, 0x55555555, 0);
    // round 2: key addition (same r1 / r0 streams as round 1)
    ldmia.w r1!, {r8-r11}           //load rkeys in r8,...,r11
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldmia.w r0!,{r8-r11}            //load rtk_1 in r8,...,r11
    eor     r2, r2, r8              //add rtk_1
    eor     r3, r3, r9              //add rtk_1
    eor     r4, r4, r10             //add rtk_1
    eor     r5, r5, r11             //add rtk_1
    and     r8, r7, r2, ror #16     // --- mixcolumns 1 ---
    eor     r2, r2, r8, ror #30
    and     r8, r7, r2, ror #28
    eor     r2, r2, r8
    and     r8, r7, r2, ror #16
    eor     r2, r2, r8, ror #2
    and     r8, r7, r3, ror #16
    eor     r3, r3, r8, ror #30
    and     r8, r7, r3, ror #28
    eor     r3, r3, r8
    and     r8, r7, r3, ror #16
    eor     r3, r3, r8, ror #2
    and     r8, r7, r4, ror #16
    eor     r4, r4, r8, ror #30
    and     r8, r7, r4, ror #28
    eor     r4, r4, r8
    and     r8, r7, r4, ror #16
    eor     r4, r4, r8, ror #2
    and     r8, r7, r5, ror #16
    eor     r5, r5, r8, ror #30
    and     r8, r7, r5, ror #28
    eor     r5, r5, r8
    and     r8, r7, r5, ror #16
    eor     r5, r5, r8, ror #2
    // ===== round 3 of 4: substitution layer (same sequence as round 1) =====
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    eor     r8, r2, r5
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r5, r5, r8              //SWAPMOVE(r5, r2, 0x55555555, 0);
    // round 3: key addition
    ldmia.w r1!, {r8-r11}           //load rkeys in r8,...,r11
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldmia.w r0!,{r8-r11}            //load rtk_1 in r8,...,r11
    eor     r2, r2, r8              //add rtk_1
    eor     r3, r3, r9              //add rtk_1
    eor     r4, r4, r10             //add rtk_1
    eor     r5, r5, r11             //add rtk_1
    and     r8, r7, r2, ror #10     // --- mixcolumns 2 ---
    eor     r2, r2, r8, ror #4
    and     r8, r7, r2, ror #6
    eor     r2, r2, r8, ror #6
    and     r8, r7, r2, ror #26
    eor     r2, r2, r8
    and     r8, r7, r3, ror #10
    eor     r3, r3, r8, ror #4
    and     r8, r7, r3, ror #6
    eor     r3, r3, r8, ror #6
    and     r8, r7, r3, ror #26
    eor     r3, r3, r8
    and     r8, r7, r4, ror #10
    eor     r4, r4, r8, ror #4
    and     r8, r7, r4, ror #6
    eor     r4, r4, r8, ror #6
    and     r8, r7, r4, ror #26
    eor     r4, r4, r8
    and     r8, r7, r5, ror #10
    eor     r5, r5, r8, ror #4
    and     r8, r7, r5, ror #6
    eor     r5, r5, r8, ror #6
    and     r8, r7, r5, ror #26
    eor     r5, r5, r8
    // ===== round 4 of 4: substitution layer (same sequence as round 2) =====
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    eor     r8, r3, r4
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8              //SWAPMOVE(r4, r3, 0x55555555, 0);
    // round 4: key addition
    ldmia.w r1!, {r8-r11}           //load rkeys in r8,...,r11
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldmia.w r0!,{r8-r11}            //load rtk_1 in r8,...,r11
    eor     r2, r2, r8              //add rtk_1
    eor     r3, r3, r9              //add rtk_1
    eor     r4, r4, r10             //add rtk_1
    eor     r5, r5, r11             //add rtk_1
    and     r8, r7, r2, ror #4      // --- mixcolumns 3 ---
    eor     r2, r2, r8, ror #26
    and     r8, r7, r2
    eor     r2, r2, r8, ror #4
    and     r8, r7, r2, ror #4
    eor     r2, r2, r8, ror #22
    and     r8, r7, r3, ror #4
    eor     r3, r3, r8, ror #26
    and     r8, r7, r3
    eor     r3, r3, r8, ror #4
    and     r8, r7, r3, ror #4
    eor     r3, r3, r8, ror #22
    and     r8, r7, r4, ror #4
    eor     r4, r4, r8, ror #26
    and     r8, r7, r4
    eor     r4, r4, r8, ror #4
    and     r8, r7, r4, ror #4
    eor     r4, r4, r8, ror #22
    and     r8, r7, r5, ror #4
    eor     r5, r5, r8, ror #26
    and     r8, r7, r5
    eor     r5, r5, r8, ror #4
    and     r8, r7, r5, ror #4
    eor     r5, r5, r8, ror #22
    bx      lr

/******************************************************************************
* Inverse quadruple round of fixsliced SKINNY-128.
******************************************************************************/
/*
 * Undoes four rounds on the state held in r2-r5. The stages run in the order
 * mixcolumns 3, 2, 1, 0; each stage is: inverse mixcolumns, addition of four
 * words read through r1 and four words read through r0, then the inverse
 * substitution layer (SWAPMOVE / orr / eor / mvn steps).
 *
 * In/Out: r2-r5 = state (updated in place)
 * In:     r0    = pointer into the rtk1 words, walked BACKWARDS
 *         r1    = pointer into the round-key + rconst words, walked BACKWARDS
 *                 Both pointers are read with two post-decrementing ldrd:
 *                 the first loads the words at [p] and [p+4] (added to r4, r5),
 *                 the second the 8 bytes below them (added to r2, r3). On entry
 *                 each pointer must therefore address byte 8 of the 16-byte
 *                 entry to consume first; each is lowered by 16 bytes per
 *                 stage, 64 bytes per call.
 *         r6    = SWAPMOVE mask (0x55555555 according to the inline comments;
 *                 loaded by the caller)
 *         r7    = mixcolumns mask, loaded by the caller (value not visible in
 *                 this routine)
 * Scratch: r8-r11. Leaf routine: returns with bx lr, uses no stack.
 */
.align 2
inv_quadruple_round:
    // ===== stage 1 of 4: inverse of mixcolumns 3 =====
    and     r8, r7, r2, ror #4      // --- mixcolumns 3 ---
    eor     r2, r2, r8, ror #22
    and     r8, r7, r2
    eor     r2, r2, r8, ror #4
    and     r8, r7, r2, ror #4
    eor     r2, r2, r8, ror #26
    and     r8, r7, r3, ror #4
    eor     r3, r3, r8, ror #22
    and     r8, r7, r3
    eor     r3, r3, r8, ror #4
    and     r8, r7, r3, ror #4
    eor     r3, r3, r8, ror #26
    and     r8, r7, r4, ror #4
    eor     r4, r4, r8, ror #22
    and     r8, r7, r4
    eor     r4, r4, r8, ror #4
    and     r8, r7, r4, ror #4
    eor     r4, r4, r8, ror #26
    and     r8, r7, r5, ror #4
    eor     r5, r5, r8, ror #22
    and     r8, r7, r5
    eor     r5, r5, r8, ror #4
    and     r8, r7, r5, ror #4
    eor     r5, r5, r8, ror #26
    // stage 1: key addition (upper two words first, pointers step down by 8)
    ldrd    r10, r11, [r1], #-8
    ldrd    r8, r9, [r1], #-8
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldrd    r10, r11, [r0], #-8
    ldrd    r8, r9, [r0], #-8
    eor     r2, r2, r8              //add rtk1
    eor     r3, r3, r9              //add rtk1
    eor     r4, r4, r10             //add rtk1
    eor     r5, r5, r11             //add rtk1
    // stage 1: inverse substitution layer
    eor     r8, r3, r4
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8              //SWAPMOVE(r4, r3, 0x55555555, 0);
    orr     r8, r2, r3
    eor     r5, r5, r8
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    // ===== stage 2 of 4: inverse of mixcolumns 2 =====
    and     r8, r7, r2, ror #26     // --- mixcolumns 2 ---
    eor     r2, r2, r8
    and     r8, r7, r2, ror #6
    eor     r2, r2, r8, ror #6
    and     r8, r7, r2, ror #10
    eor     r2, r2, r8, ror #4
    and     r8, r7, r3, ror #26
    eor     r3, r3, r8
    and     r8, r7, r3, ror #6
    eor     r3, r3, r8, ror #6
    and     r8, r7, r3, ror #10
    eor     r3, r3, r8, ror #4
    and     r8, r7, r4, ror #26
    eor     r4, r4, r8
    and     r8, r7, r4, ror #6
    eor     r4, r4, r8, ror #6
    and     r8, r7, r4, ror #10
    eor     r4, r4, r8, ror #4
    and     r8, r7, r5, ror #26
    eor     r5, r5, r8
    and     r8, r7, r5, ror #6
    eor     r5, r5, r8, ror #6
    and     r8, r7, r5, ror #10
    eor     r5, r5, r8, ror #4
    // stage 2: key addition
    ldrd    r10, r11, [r1], #-8
    ldrd    r8, r9, [r1], #-8
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldrd    r10, r11, [r0], #-8
    ldrd    r8, r9, [r0], #-8
    eor     r2, r2, r8              //add rtk1
    eor     r3, r3, r9              //add rtk1
    eor     r4, r4, r10             //add rtk1
    eor     r5, r5, r11             //add rtk1
    // stage 2: inverse substitution layer
    eor     r8, r2, r5
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r5, r5, r8              //SWAPMOVE(r5, r2, 0x55555555, 0);
    orr     r8, r4, r5
    eor     r3, r3, r8
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    // ===== stage 3 of 4: inverse of mixcolumns 1 =====
    and     r8, r7, r2, ror #16     // --- mixcolumns 1 ---
    eor     r2, r2, r8, ror #2
    and     r8, r7, r2, ror #28
    eor     r2, r2, r8
    and     r8, r7, r2, ror #16
    eor     r2, r2, r8, ror #30
    and     r8, r7, r3, ror #16
    eor     r3, r3, r8, ror #2
    and     r8, r7, r3, ror #28
    eor     r3, r3, r8
    and     r8, r7, r3, ror #16
    eor     r3, r3, r8, ror #30
    and     r8, r7, r4, ror #16
    eor     r4, r4, r8, ror #2
    and     r8, r7, r4, ror #28
    eor     r4, r4, r8
    and     r8, r7, r4, ror #16
    eor     r4, r4, r8, ror #30
    and     r8, r7, r5, ror #16
    eor     r5, r5, r8, ror #2
    and     r8, r7, r5, ror #28
    eor     r5, r5, r8
    and     r8, r7, r5, ror #16
    eor     r5, r5, r8, ror #30
    // stage 3: key addition
    ldrd    r10, r11, [r1], #-8
    ldrd    r8, r9, [r1], #-8
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldrd    r10, r11, [r0], #-8
    ldrd    r8, r9, [r0], #-8
    eor     r2, r2, r8              //add rtk1
    eor     r3, r3, r9              //add rtk1
    eor     r4, r4, r10             //add rtk1
    eor     r5, r5, r11             //add rtk1
    // stage 3: inverse substitution layer (same sequence as stage 1)
    eor     r8, r3, r4
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8              //SWAPMOVE(r4, r3, 0x55555555, 0);
    orr     r8, r2, r3
    eor     r5, r5, r8
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    // ===== stage 4 of 4: inverse of mixcolumns 0 =====
    and     r8, r7, r2, ror #6      // --- mixcolumns 0 ---
    eor     r2, r2, r8, ror #4
    and     r8, r7, r2, ror #18
    eor     r2, r2, r8, ror #2
    and     r8, r7, r2, ror #30
    eor     r2, r2, r8, ror #24
    and     r8, r7, r3, ror #6
    eor     r3, r3, r8, ror #4
    and     r8, r7, r3, ror #18
    eor     r3, r3, r8, ror #2
    and     r8, r7, r3, ror #30
    eor     r3, r3, r8, ror #24
    and     r8, r7, r4, ror #6
    eor     r4, r4, r8, ror #4
    and     r8, r7, r4, ror #18
    eor     r4, r4, r8, ror #2
    and     r8, r7, r4, ror #30
    eor     r4, r4, r8, ror #24
    and     r8, r7, r5, ror #6
    eor     r5, r5, r8, ror #4
    and     r8, r7, r5, ror #18
    eor     r5, r5, r8, ror #2
    and     r8, r7, r5, ror #30
    eor     r5, r5, r8, ror #24
    // stage 4: key addition
    ldrd    r10, r11, [r1], #-8
    ldrd    r8, r9, [r1], #-8
    eor     r2, r2, r8              //add rtk_2_3 + rconst
    eor     r3, r3, r9              //add rtk_2_3 + rconst
    eor     r4, r4, r10             //add rtk_2_3 + rconst
    eor     r5, r5, r11             //add rtk_2_3 + rconst
    ldrd    r10, r11, [r0], #-8
    ldrd    r8, r9, [r0], #-8
    eor     r2, r2, r8              //add rtk1
    eor     r3, r3, r9              //add rtk1
    eor     r4, r4, r10             //add rtk1
    eor     r5, r5, r11             //add rtk1
    // stage 4: inverse substitution layer (same sequence as stage 2)
    eor     r8, r2, r5
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r5, r5, r8              //SWAPMOVE(r5, r2, 0x55555555, 0);
    orr     r8, r4, r5
    eor     r3, r3, r8
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    eor     r8, r5, r2, lsr #1
    and     r8, r8, r6
    eor     r5, r5, r8
    eor     r2, r2, r8, lsl #1      //SWAPMOVE(r2, r5, 0x55555555, 1);
    eor     r8, r2, r3, lsr #1
    and     r8, r8, r6
    eor     r2, r2, r8
    eor     r3, r3, r8, lsl #1      //SWAPMOVE(r3, r2, 0x55555555, 1);
    orr     r8, r4, r5
    eor     r3, r3, r8
    mvn     r3, r3
    eor     r8, r4, r5, lsr #1
    and     r8, r8, r6
    eor     r4, r4, r8
    eor     r5, r5, r8, lsl #1      //SWAPMOVE(r5, r4, 0x55555555, 1);
    eor     r8, r3, r4, lsr #1
    and     r8, r8, r6
    eor     r3, r3, r8
    eor     r4, r4, r8, lsl #1      //SWAPMOVE(r4, r3, 0x55555555, 1);
    orr     r8, r2, r3
    eor     r5, r5, r8
    mvn     r5, r5
    bx      lr

/******************************************************************************
* Encrypt a single block using fixsliced SKINNY-128-384+.
******************************************************************************/
/*
 * Register usage, as established by the code below:
 *   on entry  r0 = ctext (16-byte output buffer), r1 = tk,
 *             r2 = ptext (16-byte input block),   r3 = rtk1
 *   in rounds r0 = rtk1 pointer (copied from r3), r1 = tk (not modified by
 *             this routine itself), r2-r5 = cipher state,
 *             r6 = 0x55555555, r7 = copy of r11 taken after packing
 *
 * All of r0-r12 and r14 are saved on entry; r1-r12 and r14 are restored on
 * exit, and r0 holds the ctext pointer again when the function returns.
 *
 * The input is read with ldr.w and the output is written with str.w, so
 * neither byte buffer needs to be word-aligned (provided unaligned-access
 * trapping is not enabled). The previous strd-based store raised an alignment
 * fault whenever ctext was not 4-byte aligned.
 */
@ void skinny128_384(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384
.type   skinny128_384,%function
.align 2
skinny128_384:
    push    {r0-r12, r14}           // saved r0 (ctext) ends up at [sp]
    mov.w   r0, r3                  // r0 <- rtk1 pointer for the round calls

    // load the 16-byte input block; r2 is overwritten last as it is the base
    ldr.w   r3, [r2, #8]
    ldr.w   r4, [r2, #4]
    ldr.w   r5, [r2, #12]
    ldr.w   r2, [r2]

    // constants set up before the call to packing (presumably its masks)
    movw    r10, #0x0a0a
    movt    r10, #0x0a0a            //r10 <- 0x0a0a0a0a
    movw    r11, #0x3030
    movt    r11, #0x3030            //r11 <- 0x30303030
    bl      packing

    // constants held in r6/r7 across all the round calls
    mov     r7, r11                 // presumably still 0x30303030 after packing
    movw    r6, #0x5555
    movt    r6, #0x5555             //r6 <- 0x55555555

    // ten calls to quadruple_round, rewinding the rtk1 pointer after every
    // fourth call
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    sub.w   r0, #256                // rtk1 repeats every 16 rounds
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    sub.w   r0, #256                // rtk1 repeats every 16 rounds
    bl      quadruple_round
    bl      quadruple_round

    bl      unpacking

    // write the result back in the same word order the input was loaded
    ldr.w   r0, [sp], #4            // pop the saved ctext pointer
    str.w   r2, [r0]
    str.w   r4, [r0, #4]
    str.w   r3, [r0, #8]
    str.w   r5, [r0, #12]
    pop     {r1-r12,r14}
    bx      lr
.size   skinny128_384, .-skinny128_384

/******************************************************************************
 * Decrypt a single block using fixsliced SKINNY-128-384+.
******************************************************************************/
/*
 * Register usage, as established by the code below:
 *   on entry  r0 = first argument  (16-byte output buffer),
 *             r1 = tk,
 *             r2 = third argument  (16-byte input block),
 *             r3 = rtk1
 *   in rounds r0 = rtk1 pointer, r1 = tk pointer, r2-r5 = cipher state,
 *             r6 = 0x55555555, r7 = copy of r11 taken after packing
 *
 * NOTE(review): the prototype below reuses the parameter names of the
 * encryption routine. As implemented, the block to process is read from the
 * third argument and the result is written to the first one, so for
 * decryption the names "ctext" and "ptext" look swapped -- confirm against
 * the C declaration before renaming.
 *
 * All of r0-r12 and r14 are saved on entry; r1-r12 and r14 are restored on
 * exit, and r0 holds the output pointer again when the function returns.
 *
 * The input is read with ldr.w and the output is written with str.w, so
 * neither byte buffer needs to be word-aligned (provided unaligned-access
 * trapping is not enabled). The previous strd-based store raised an alignment
 * fault whenever the output pointer was not 4-byte aligned.
 */
@ void skinny128_384_inv(u8* ctext, const u32* tk, const u8* ptext, const u32* rtk1)
.global skinny128_384_inv
.type   skinny128_384_inv,%function
.align 2
skinny128_384_inv:
    push    {r0-r12, r14}           // saved r0 (output ptr) ends up at [sp]
    mov.w   r0, r3                  // r0 <- rtk1 pointer for the round calls

    // load the 16-byte input block; r2 is overwritten last as it is the base
    ldr.w   r3, [r2, #8]
    ldr.w   r4, [r2, #4]
    ldr.w   r5, [r2, #12]
    ldr.w   r2, [r2]

    // constants set up before the call to packing (presumably its masks)
    movw    r10, #0x0a0a
    movt    r10, #0x0a0a            //r10 <- 0x0a0a0a0a
    movw    r11, #0x3030
    movt    r11, #0x3030            //r11 <- 0x30303030
    bl      packing

    // constants held in r6/r7 across all the round calls
    mov     r7, r11                 // presumably still 0x30303030 after packing
    movw    r6, #0x5555
    movt    r6, #0x5555             //r6 <- 0x55555555

    // Start both round-key pointers at the end of their data.
    // 120 = 8*16 - 8 and 632 = 40*16 - 8: presumably the upper 8 bytes of the
    // last 16-byte entry that is consumed first -- confirm against the
    // layout produced by the key schedule.
    add.w   r0, #120                // points to the right rtk1
    add.w   r1, #632                // points to the last rtk2_3

    // ten calls to inv_quadruple_round; the rtk1 pointer is moved forward by
    // 256 bytes after the second and after the sixth call
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    add.w   r0, #256                // rtk1 repeats every 16 rounds
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    add.w   r0, #256                // rtk1 repeats every 16 rounds
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round

    bl      unpacking

    // write the result back in the same word order the input was loaded
    ldr.w   r0, [sp], #4            // pop the saved output pointer
    str.w   r2, [r0]
    str.w   r4, [r0, #4]
    str.w   r3, [r0, #8]
    str.w   r5, [r0, #12]
    pop     {r1-r12,r14}
    bx      lr
.size   skinny128_384_inv, .-skinny128_384_inv