/*******************************************************************************
 * Constant-time ARM assembly implementation of the SKINNY block cipher.
 * Two blocks are processed in parallel.
 *
 * @author   Alexandre Adomnicai, Nanyang Technological University,
 *           alexandre.adomnicai@ntu.edu.sg
 *
 * @date     April 2020
 *******************************************************************************/

.syntax unified
.thumb

/*******************************************************************************
 * Applies P^2 on the tweakey state in a bitsliced manner.
 *******************************************************************************/
.align 2
p2:
    movw    r3, #0xcc00
    movt    r3, #0xcc00             //r3 <- 0xcc00cc00
    movw    r4, #0xcc00
    movt    r4, #0x0033             //r4 <- 0x0033cc00
    and     r1, r3, r5, ror #14     // --- permute r5 twice
    bfi     r1, r5, #16, #8
    and     r2, r5, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r5
    orr     r1, r1, r2, lsr #8
    and     r2, r5, #0x00cc0000
    orr     r5, r1, r2, lsr #18     // permute r5 twice ---
    and     r1, r3, r6, ror #14     // --- permute r6 twice
    bfi     r1, r6, #16, #8
    and     r2, r6, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r6
    orr     r1, r1, r2, lsr #8
    and     r2, r6, #0x00cc0000
    orr     r6, r1, r2, lsr #18     // permute r6 twice ---
    and     r1, r3, r7, ror #14     // --- permute r7 twice
    bfi     r1, r7, #16, #8
    and     r2, r7, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r7
    orr     r1, r1, r2, lsr #8
    and     r2, r7, #0x00cc0000
    orr     r7, r1, r2, lsr #18     // permute r7 twice ---
    and     r1, r3, r8, ror #14     // --- permute r8 twice
    bfi     r1, r8, #16, #8
    and     r2, r8, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r8
    orr     r1, r1, r2, lsr #8
    and     r2, r8, #0x00cc0000
    orr     r8, r1, r2, lsr #18     // permute r8 twice ---
    and     r1, r3, r9, ror #14     // --- permute r9 twice
    bfi     r1, r9, #16, #8
    and     r2, r9, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r9
    orr     r1, r1, r2, lsr #8
    and     r2, r9, #0x00cc0000
    orr     r9, r1, r2, lsr #18     // permute r9 twice ---
    and     r1, r3, r10, ror #14    // --- permute r10 twice
    bfi     r1, r10, #16, #8
    and     r2, r10, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r10
    orr     r1, r1, r2, lsr #8
    and     r2, r10, #0x00cc0000
    orr     r10, r1, r2, lsr #18    // permute r10 twice ---
    and     r1, r3, r11, ror #14    // --- permute r11 twice
    bfi     r1, r11, #16, #8
    and     r2, r11, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r11
    orr     r1, r1, r2, lsr #8
    and     r2, r11, #0x00cc0000
    orr     r11, r1, r2, lsr #18    // permute r11 twice ---
    and     r1, r3, r12, ror #14    // --- permute r12 twice
    bfi     r1, r12, #16, #8
    and     r2, r12, #0xcc000000
    orr     r1, r1, r2, lsr #2
    and     r2, r4, r12
    orr     r1, r1, r2, lsr #8
    and     r2, r12, #0x00cc0000
    orr     r12, r1, r2, lsr #18    // permute r12 twice ---
    bx      lr

/*******************************************************************************
 * Applies P^4 on the tweakey state in a bitsliced manner.
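 *
 * Here P denotes the SKINNY tweakey permutation
 * PT = [9,15,8,13,10,14,12,11,0,1,2,3,4,5,6,7]; the routines p2..p14 apply
 * its even powers directly on the eight bitsliced tweakey words r5-r12 via
 * masks and rotations. As an illustrative reference only (plain C sketch,
 * not part of this file), P^k on a 16-cell tweakey array could be written:
 *
 *   static const int PT[16] = {9,15,8,13,10,14,12,11,0,1,2,3,4,5,6,7};
 *   void perm_k(unsigned char cells[16], int k) {
 *       unsigned char tmp[16];
 *       for (int i = 0; i < k; i++) {               // apply PT k times
 *           for (int j = 0; j < 16; j++) tmp[j] = cells[PT[j]];
 *           for (int j = 0; j < 16; j++) cells[j] = tmp[j];
 *       }
 *   }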
*******************************************************************************/ .align 2 p4: str.w r14, [sp] //store r14 on the stack movw r14, #0x00cc movt r14, #0xcc00 //r14<- 0xcc0000cc movw r3, #0xcc00 movt r3, #0x3300 //r3 <- 0x3300cc00 movw r4, #0x00cc movt r4, #0x00cc //r4 <- 0x00cc00cc and r2, r14, r5, ror #22 and r1, r3, r5, ror #16 orr r2, r2, r1 and r1, r5, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r5, r5, r1 orr r5, r2, r5, ror #24 and r2, r14, r6, ror #22 and r1, r3, r6, ror #16 orr r2, r2, r1 and r1, r6, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r6, r6, r1 orr r6, r2, r6, ror #24 and r2, r14, r7, ror #22 and r1, r3, r7, ror #16 orr r2, r2, r1 and r1, r7, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r7, r7, r1 orr r7, r2, r7, ror #24 and r2, r14, r8, ror #22 and r1, r3, r8, ror #16 orr r2, r2, r1 and r1, r8, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r8, r8, r1 orr r8, r2, r8, ror #24 and r2, r14, r9, ror #22 and r1, r3, r9, ror #16 orr r2, r2, r1 and r1, r9, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r9, r9, r1 orr r9, r2, r9, ror #24 and r2, r14, r10, ror #22 and r1, r3, r10, ror #16 orr r2, r2, r1 and r1, r10, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r10, r10, r1 orr r10, r2, r10, ror #24 and r2, r14, r11, ror #22 and r1, r3, r11, ror #16 orr r2, r2, r1 and r1, r11, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r11, r11, r1 orr r11, r2, r11, ror #24 and r2, r14, r12, ror #22 and r1, r3, r12, ror #16 orr r2, r2, r1 and r1, r12, r4 orr r2, r2, r1, lsr #2 movw r1, #0xcc33 //r1 <- 0x0000cc33 and r12, r12, r1 orr r12, r2, r12, ror #24 ldr.w r14, [sp] //restore r14 bx lr /******************************************************************************* * Applies P^6 on the tweakey state in a bitsliced manner *******************************************************************************/ .align 2 p6: movw r3, #0x3333 //r1 <- 0x00003333 movw r4, #0x00cc movt r4, #0x3300 //r12<- 0x330000cc and r1, r5, r3, ror #8 // --- permute r5 6 times and r2, r4, r5, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r5, ror #10 orr r2, r2, r1 and r1, r5, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r5, #0x00003300 orr r5, r2, r1, lsl #2 // permute r5 6 times --- and r1, r6, r3, ror #8 // --- permute r6 6 times and r2, r4, r6, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r6, ror #10 orr r2, r2, r1 and r1, r6, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r6, #0x00003300 orr r6, r2, r1, lsl #2 // permute r6 6 times --- and r1, r7, r3, ror #8 // --- permute r7 6 times and r2, r4, r7, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r7, ror #10 orr r2, r2, r1 and r1, r7, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r7, #0x00003300 orr r7, r2, r1, lsl #2 // permute r7 6 times --- and r1, r8, r3, ror #8 // --- permute r8 6 times and r2, r4, r8, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r8, ror #10 orr r2, r2, r1 and r1, r8, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r8, #0x00003300 orr r8, r2, r1, lsl #2 // permute r8 6 times --- and r1, r9, r3, ror #8 // --- permute r9 6 times and r2, r4, r9, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r9, ror #10 orr r2, r2, r1 and r1, r9, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r9, #0x00003300 orr r9, r2, r1, lsl #2 // permute r9 6 times --- and r1, r10, r3, ror #8 // --- permute r10 6 times and r2, r4, r10, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r10, ror #10 orr r2, r2, r1 and r1, r10, #0x000000cc orr r2, r2, r1, lsl 
#14 and r1, r10, #0x00003300 orr r10, r2, r1, lsl #2 // permute r10 6 times --- and r1, r11, r3, ror #8 // --- permute r11 6 times and r2, r4, r11, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r11, ror #10 orr r2, r2, r1 and r1, r11, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r11, #0x00003300 orr r11, r2, r1, lsl #2 // permute r11 6 times --- and r1, r12, r3, ror #8 // --- permute r12 6 times and r2, r4, r12, ror #24 orr r2, r2, r1, ror #6 and r1, r3, r12, ror #10 orr r2, r2, r1 and r1, r12, #0x000000cc orr r2, r2, r1, lsl #14 and r1, r12, #0x00003300 orr r12, r2, r1, lsl #2 // permute r12 6 times --- bx lr /******************************************************************************* * Applies P^8 on the tweakey state in a bitsliced manner. *******************************************************************************/ .align 2 p8: movw r3, #0x3333 //r3 <- 0x00003333 movw r4, #0x0000 movt r4, #0x33cc //r4 <- 0x33cc0000 and r1, r5, r4 // --- permute r5 8 times and r2, r4, r5, ror #8 orr r2, r2, r1, ror #24 and r1, r5, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r5, r3, lsl #8 orr r5, r2, r1, lsr #6 // permute r5 8 times --- and r1, r6, r4 // --- permute r6 8 times and r2, r4, r6, ror #8 orr r2, r2, r1, ror #24 and r1, r6, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r6, r3, lsl #8 orr r6, r2, r1, lsr #6 // permute r6 8 times --- and r1, r7, r4 // --- permute r7 8 times and r2, r4, r7, ror #8 orr r2, r2, r1, ror #24 and r1, r7, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r7, r3, lsl #8 orr r7, r2, r1, lsr #6 // permute r7 8 times --- and r1, r8, r4 // --- permute r8 8 times and r2, r4, r8, ror #8 orr r2, r2, r1, ror #24 and r1, r8, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r8, r3, lsl #8 orr r8, r2, r1, lsr #6 // permute r8 8 times --- and r1, r9, r4 // --- permute r9 8 times and r2, r4, r9, ror #8 orr r2, r2, r1, ror #24 and r1, r9, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r9, r3, lsl #8 orr r9, r2, r1, lsr #6 // permute r9 8 times --- and r1, r10, r4 // --- permute r10 8 times and r2, r4, r10, ror #8 orr r2, r2, r1, ror #24 and r1, r10, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r10, r3, lsl #8 orr r10, r2, r1, lsr #6 // permute r10 8 times --- and r1, r11, r4 // --- permute r11 8 times and r2, r4, r11, ror #8 orr r2, r2, r1, ror #24 and r1, r11, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r11, r3, lsl #8 orr r11, r2, r1, lsr #6 // permute r11 8 times --- and r1, r12, r4 // --- permute r12 8 times and r2, r4, r12, ror #8 orr r2, r2, r1, ror #24 and r1, r12, r3, lsl #2 orr r2, r2, r1, ror #26 and r1, r12, r3, lsl #8 orr r12, r2, r1, lsr #6 // permute r12 8 times --- bx lr /******************************************************************************* * Applies P^10 on the tweakey state in a bitsliced manner. 
*******************************************************************************/ .align 2 p10: movw r4, #0x0033 movt r4, #0x3300 //r4 <- 0x33000033 movw r3, #0xcc33 //r3 <- 0x0000cc33 and r1, r5, r3, ror #8 // --- permute r5 10 times and r2, r4, r5, ror #26 orr r2, r2, r1, ror #8 and r1, r5, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r5, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r5, #0x0000cc00 orr r5, r2, r1, lsr #2 // permute r5 10 times --- and r1, r6, r3, ror #8 // --- permute r6 10 times and r2, r4, r6, ror #26 orr r2, r2, r1, ror #8 and r1, r6, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r6, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r6, #0x0000cc00 orr r6, r2, r1, lsr #2 // permute r6 10 times --- and r1, r7, r3, ror #8 // --- permute r7 10 times and r2, r4, r7, ror #26 orr r2, r2, r1, ror #8 and r1, r7, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r7, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r7, #0x0000cc00 orr r7, r2, r1, lsr #2 // permute r7 10 times --- and r1, r8, r3, ror #8 // --- permute r8 10 times and r2, r4, r8, ror #26 orr r2, r2, r1, ror #8 and r1, r8, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r8, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r8, #0x0000cc00 orr r8, r2, r1, lsr #2 // permute r8 10 times --- and r1, r9, r3, ror #8 // --- permute r9 10 times and r2, r4, r9, ror #26 orr r2, r2, r1, ror #8 and r1, r9, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r9, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r9, #0x0000cc00 orr r9, r2, r1, lsr #2 // permute r9 10 times --- and r1, r10, r3, ror #8 // --- permute r10 10 times and r2, r4, r10, ror #26 orr r2, r2, r1, ror #8 and r1, r10, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r10, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r10, #0x0000cc00 orr r10, r2, r1, lsr #2 // permute r10 10 times --- and r1, r11, r3, ror #8 // --- permute r11 10 times and r2, r4, r11, ror #26 orr r2, r2, r1, ror #8 and r1, r11, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r11, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r11, #0x0000cc00 orr r11, r2, r1, lsr #2 // permute r11 10 times --- and r1, r12, r3, ror #8 // --- permute r12 10 times and r2, r4, r12, ror #26 orr r2, r2, r1, ror #8 and r1, r12, r4, ror #24 orr r2, r2, r1, ror #22 and r1, r12, #0x00330000 orr r2, r2, r1, lsr #14 and r1, r12, #0x0000cc00 orr r12, r2, r1, lsr #2 // permute r12 10 times --- bx lr /******************************************************************************* * Applies P^12 on the tweakey state in a bitsliced manner. 
*******************************************************************************/ .align 2 p12: str.w r14, [sp] //store r14 on the stack movw r14, #0xcc33 //r14<- 0x0000cc33 movw r4, #0x00cc movt r4, #0x00cc //r4 <- 0x00cc00cc movw r3, #0x3300 movt r3, #0xcc00 //r3 <- 0xcc003300 and r1, r14, r5, ror #8 // --- permute r5 12 times and r2, r4, r5, ror #30 orr r2, r2, r1 and r1, r3, r5, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r5, r1, ror #8 orr r5, r2, r1, ror #10 // permute r5 12 times --- and r1, r14, r6, ror #8 // --- permute r6 12 times and r2, r4, r6, ror #30 orr r2, r2, r1 and r1, r3, r6, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r6, r1, ror #8 orr r6, r2, r1, ror #10 // permute r6 12 times --- and r1, r14, r7, ror #8 // --- permute r7 12 times and r2, r4, r7, ror #30 orr r2, r2, r1 and r1, r3, r7, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r7, r1, ror #8 orr r7, r2, r1, ror #10 // permute r7 12 times --- and r1, r14, r8, ror #8 // --- permute r8 12 times and r2, r4, r8, ror #30 orr r2, r2, r1 and r1, r3, r8, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r8, r1, ror #8 orr r8, r2, r1, ror #10 // permute r8 12 times --- and r1, r14, r9, ror #8 // --- permute r9 12 times and r2, r4, r9, ror #30 orr r2, r2, r1 and r1, r3, r9, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r9, r1, ror #8 orr r9, r2, r1, ror #10 // permute r9 12 times --- and r1, r14, r10, ror #8 // --- permute r10 12 times and r2, r4, r10, ror #30 orr r2, r2, r1 and r1, r3, r10, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r10, r1, ror #8 orr r10, r2, r1, ror #10 // permute r10 12 times --- and r1, r14, r11, ror #8 // --- permute r11 12 times and r2, r4, r11, ror #30 orr r2, r2, r1 and r1, r3, r11, ror #16 orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r11, r1, ror #8 orr r11, r2, r1, ror #10 // permute r11 12 times --- and r1, r14, r12, ror #8 // --- permute r12 12 times and r2, r4, r12, ror #30 orr r2, r2, r1 and r1, r3, r12, ror #16 ldr.w r14, [sp] orr r2, r2, r1 movw r1, #0xcccc //r1 <- 0x0000cccc and r1, r12, r1, ror #8 orr r12, r2, r1, ror #10 // permute r12 12 times --- bx lr /******************************************************************************* * Applies P^14 on the tweakey state in a bitsliced manner. 
*******************************************************************************/ .align 2 p14: movw r3, #0xcc00 movt r3, #0x0033 //r3 <- 0x0033cc00 movw r4, #0xcc00 movt r4, #0xcc00 //r4 <- 0x33003300 and r1, r3, r5, ror #24 // --- permute r5 14 times and r2, r5, #0x00000033 orr r2, r1, r2, ror #14 and r1, r5, #0x33000000 orr r2, r2, r1, ror #30 and r1, r5, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r5, r4 orr r5, r2, r1, ror #18 // permute r5 14 times --- and r1, r3, r6, ror #24 // --- permute r6 14 times and r2, r6, #0x00000033 orr r2, r1, r2, ror #14 and r1, r6, #0x33000000 orr r2, r2, r1, ror #30 and r1, r6, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r6, r4 orr r6, r2, r1, ror #18 // permute r6 14 times --- and r1, r3, r7, ror #24 // --- permute r7 14 times and r2, r7, #0x00000033 orr r2, r1, r2, ror #14 and r1, r7, #0x33000000 orr r2, r2, r1, ror #30 and r1, r7, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r7, r4 orr r7, r2, r1, ror #18 // permute r7 14 times --- and r1, r3, r8, ror #24 // --- permute r8 14 times and r2, r8, #0x00000033 orr r2, r1, r2, ror #14 and r1, r8, #0x33000000 orr r2, r2, r1, ror #30 and r1, r8, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r8, r4 orr r8, r2, r1, ror #18 // permute r8 14 times --- and r1, r3, r9, ror #24 // --- permute r9 14 times and r2, r9, #0x00000033 orr r2, r1, r2, ror #14 and r1, r9, #0x33000000 orr r2, r2, r1, ror #30 and r1, r9, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r9, r4 orr r9, r2, r1, ror #18 // permute r9 14 times --- and r1, r3, r10, ror #24 // --- permute r10 14 times and r2, r10, #0x00000033 orr r2, r1, r2, ror #14 and r1, r10, #0x33000000 orr r2, r2, r1, ror #30 and r1, r10, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r10, r4 orr r10, r2, r1, ror #18 // permute r10 14 times --- and r1, r3, r11, ror #24 // --- permute r11 14 times and r2, r11, #0x00000033 orr r2, r1, r2, ror #14 and r1, r11, #0x33000000 orr r2, r2, r1, ror #30 and r1, r11, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r11, r4 orr r11, r2, r1, ror #18 // permute r11 14 times --- and r1, r3, r12, ror #24 // --- permute r12 14 times and r2, r12, #0x00000033 orr r2, r1, r2, ror #14 and r1, r12, #0x33000000 orr r2, r2, r1, ror #30 and r1, r12, #0x00ff0000 orr r2, r2, r1, ror #16 and r1, r12, r4 orr r12, r2, r1, ror #18 // permute r12 14 times --- bx lr /******************************************************************************* * Packs the input byte array into the fixsliced representation. 
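 *
 * Packing relies on the classical SWAPMOVE technique, which swaps the bits
 * of 'b' selected by 'mask' with the bits of 'a' selected by 'mask << n'.
 * As an illustrative reference only (plain C sketch, the macro itself is
 * not defined in this file):
 *
 *   #define SWAPMOVE(a, b, mask, n) do {                  \
 *       uint32_t tmp = ((b) ^ ((a) >> (n))) & (mask);     \
 *       (b) ^= tmp;                                       \
 *       (a) ^= tmp << (n);                                \
 *   } while (0)
 *
 * Each SWAPMOVE below maps to the 4-instruction eor/and/eor/eor pattern,
 * e.g. SWAPMOVE(r6, r5, 0x55555555, 1).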
*******************************************************************************/ .align 2 packing: eor r4, r5, r6, lsr #1 and r4, r4, r2 eor r5, r5, r4 eor r6, r6, r4, lsl #1 //SWAPMOVE(r6, r5, 0x55555555, 1) eor r4, r7, r8, lsr #1 and r4, r4, r2 eor r7, r7, r4 eor r8, r8, r4, lsl #1 //SWAPMOVE(r8, r7, 0x55555555, 1) eor r4, r9, r10, lsr #1 and r4, r4, r2 eor r9, r9, r4 eor r10, r10, r4, lsl #1 //SWAPMOVE(r10, r9, 0x55555555, 1) eor r4, r11, r12, lsr #1 and r4, r4, r2 eor r11, r11, r4 eor r12, r12, r4, lsl #1 //SWAPMOVE(r12, r11, 0x55555555, 1) eor r4, r5, r7, lsr #2 and r4, r4, r3 eor r5, r5, r4 eor r7, r7, r4, lsl #2 //SWAPMOVE(r7, r5, 0x30303030, 2) eor r4, r5, r9, lsr #4 and r4, r4, r3, lsr #2 eor r5, r5, r4 eor r9, r9, r4, lsl #4 //SWAPMOVE(r9, r5, 0x0c0c0c0c, 4) eor r4, r5, r11, lsr #6 and r4, r4, r3, lsr #4 eor r5, r5, r4 eor r11, r11, r4, lsl #6 //SWAPMOVE(r11, r5, 0x03030303, 6) eor r4, r6, r8, lsr #2 and r4, r4, r3 eor r6, r6, r4 eor r8, r8, r4, lsl #2 //SWAPMOVE(r8, r6, 0x30303030, 2) eor r4, r6, r10, lsr #4 and r4, r4, r3, lsr #2 eor r6, r6, r4 eor r10, r10, r4, lsl #4 //SWAPMOVE(r10, r6, 0x0c0c0c0c, 4) eor r4, r6, r12, lsr #6 and r4, r4, r3, lsr #4 eor r6, r6, r4 eor r12, r12, r4, lsl #6 //SWAPMOVE(r12, r6, 0x03030303, 6) eor r4, r7, r9, lsr #2 and r4, r4, r3, lsr #2 eor r7, r7, r4 eor r9, r9, r4, lsl #2 //SWAPMOVE(r9, r7, 0x0c0c0c0c, 2) eor r4, r7, r11, lsr #4 and r4, r4, r3, lsr #4 eor r7, r7, r4 eor r11, r11, r4, lsl #4 //SWAPMOVE(r11, r7, 0x03030303, 4) eor r4, r8, r10, lsr #2 and r4, r4, r3, lsr #2 eor r8, r8, r4 eor r10, r10, r4, lsl #2 //SWAPMOVE(r10, r8, 0x0c0c0c0c, 2) eor r4, r8, r12, lsr #4 and r4, r4, r3, lsr #4 eor r8, r8, r4 eor r12, r12, r4, lsl #4 //SWAPMOVE(r12, r8, 0x03030303, 4) eor r4, r9, r11, lsr #2 and r4, r4, r3, lsr #4 eor r9, r9, r4 eor r11, r11, r4, lsl #2 //SWAPMOVE(r11, r9, 0x03030303, 2) eor r4, r10, r12, lsr #2 and r4, r4, r3, lsr #4 eor r10, r10, r4 eor r12, r12, r4, lsl #2 //SWAPMOVE(r12, r10, 0x03030303, 2) bx lr /******************************************************************************* * Unpacks the internal state in fixsliced representation into a byte array. 
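 *
 * This is the inverse of 'packing': each SWAPMOVE is an involution, so
 * performing the same SWAPMOVE operations in the opposite order restores
 * the original byte ordering (unpacking(packing(state)) == state).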
*******************************************************************************/ .align 2 unpacking: movw r2, #0x5555 movt r2, #0x5555 //r2 <- 0x55555555 movw r3, #0x3030 movt r3, #0x3030 //r3 <- 0x30303030 eor r4, r9, r11, lsr #2 and r4, r4, r3, lsr #4 eor r9, r9, r4 eor r11, r11, r4, lsl #2 //SWAPMOVE(r11, r9, 0x03030303, 2) eor r4, r10, r12, lsr #2 and r4, r4, r3, lsr #4 eor r10, r10, r4 eor r12, r12, r4, lsl #2 //SWAPMOVE(r12, r10, 0x03030303, 2) eor r4, r8, r10, lsr #2 and r4, r4, r3, lsr #2 eor r8, r8, r4 eor r10, r10, r4, lsl #2 //SWAPMOVE(r10, r8, 0x0c0c0c0c, 2) eor r4, r8, r12, lsr #4 and r4, r4, r3, lsr #4 eor r8, r8, r4 eor r12, r12, r4, lsl #4 //SWAPMOVE(r12, r8, 0x03030303, 4) eor r4, r7, r9, lsr #2 and r4, r4, r3, lsr #2 eor r7, r7, r4 eor r9, r9, r4, lsl #2 //SWAPMOVE(r9, r7, 0x0c0c0c0c, 2) eor r4, r7, r11, lsr #4 and r4, r4, r3, lsr #4 eor r7, r7, r4 eor r11, r11, r4, lsl #4 //SWAPMOVE(r11, r7, 0x03030303, 4) eor r4, r6, r12, lsr #6 and r4, r4, r3, lsr #4 eor r6, r6, r4 eor r12, r12, r4, lsl #6 //SWAPMOVE(r12, r6, 0x03030303, 6) eor r4, r6, r10, lsr #4 and r4, r4, r3, lsr #2 eor r6, r6, r4 eor r10, r10, r4, lsl #4 //SWAPMOVE(r10, r6, 0x0c0c0c0c, 4) eor r4, r6, r8, lsr #2 and r4, r4, r3 eor r6, r6, r4 eor r8, r8, r4, lsl #2 //SWAPMOVE(r8, r6, 0x30303030, 2) eor r4, r5, r11, lsr #6 and r4, r4, r3, lsr #4 eor r5, r5, r4 eor r11, r11, r4, lsl #6 //SWAPMOVE(r11, r5, 0x03030303, 6) eor r4, r5, r9, lsr #4 and r4, r4, r3, lsr #2 eor r5, r5, r4 eor r9, r9, r4, lsl #4 //SWAPMOVE(r9, r5, 0x0c0c0c0c, 4) eor r4, r5, r7, lsr #2 and r4, r4, r3 eor r5, r5, r4 eor r7, r7, r4, lsl #2 //SWAPMOVE(r7, r5, 0x30303030, 2) eor r4, r5, r6, lsr #1 and r4, r4, r2 eor r5, r5, r4 eor r6, r6, r4, lsl #1 //SWAPMOVE(r6, r5, 0x55555555, 1) eor r4, r7, r8, lsr #1 and r4, r4, r2 eor r7, r7, r4 eor r8, r8, r4, lsl #1 //SWAPMOVE(r8, r7, 0x55555555, 1) eor r4, r9, r10, lsr #1 and r4, r4, r2 eor r9, r9, r4 eor r10, r10, r4, lsl #1 //SWAPMOVE(r10, r9, 0x55555555, 1) eor r4, r11, r12, lsr #1 and r4, r4, r2 eor r11, r11, r4 eor r12, r12, r4, lsl #1 //SWAPMOVE(r12, r11, 0x55555555, 1) bx lr /****************************************************************************** * Compute TK = LFSR2(TK2) for all rounds. 
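 *
 * LFSR2 is the SKINNY cell LFSR applied to TK2; on an 8-bit cell it maps
 * (x7,...,x0) to (x6,...,x0, x7^x5). As an illustrative reference only
 * (plain C sketch, not part of this file):
 *
 *   uint8_t lfsr2(uint8_t x) {
 *       return (uint8_t)((x << 1) | (((x >> 7) ^ (x >> 5)) & 1));
 *   }
 *
 * In the bitsliced representation used here, one LFSR2 step reduces to a
 * single XOR between two slice words plus a renaming of the slice
 * registers, which is what the 'eor' + shifted-store pattern below does.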
******************************************************************************/ @ void tkschedule_lfsr_2(u32* rtk, const u8* tk2, const u8* tk2_bis, const int rounds) .global tkschedule_lfsr_2 .type tkschedule_lfsr_2,%function .align 2 tkschedule_lfsr_2: push {r0-r12, r14} ldm r1, {r5,r7,r9,r11} // load the 1st block in r5,r7,r9,r11 ldm r2, {r6,r8,r10,r12} // load the 2nd block in r6,r8,r10,r12 mov.w r1, r3 //load loop counter in r1 movw r2, #0x5555 movt r2, #0x5555 //r2 <- 0x55555555 movw r3, #0x3030 movt r3, #0x3030 //r3 <- 0x30303030 bl packing stmia r0!, {r5-r12} loop_2: eor r5, r5, r7 // apply LFSR2 to tk2 stmia r0!, {r6-r12} str.w r5, [r0], #36 subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r6, r6, r8 // apply LFSR2 to tk2 stmia r0!, {r7-r12} strd r5, r6, [r0], #40 subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r7, r7, r9 // apply LFSR2 to tk2 stmia r0!, {r8-r12} stmia r0!, {r5-r7} add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r8, r8, r10 // apply LFSR2 to tk2 stmia r0!, {r9-r12} stmia r0!, {r5-r8} add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r9, r9, r11 // apply LFSR2 to tk2 stmia r0!, {r10-r12} stmia r0!, {r5-r9} add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r10, r10, r12 // apply LFSR2 to tk2 strd r11, r12, [r0], #8 stmia r0!, {r5-r10} add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r11, r11, r5 // apply LFSR2 to tk2 str.w r12, [r0], #4 stmia r0!, {r5-r11} add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // r1 = 0 => we are done eor r12, r12, r6 // apply LFSR2 to tk2 stmia r0!, {r5-r12} add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 bne loop_2 // if not 0 then we run the loop again exit_lfsr: pop {r0-r12, r14} bx lr /****************************************************************************** * Compute TK ^= LFSR3(TK3) for all rounds. 
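 *
 * LFSR3 is the SKINNY cell LFSR applied to TK3; on an 8-bit cell it maps
 * (x7,...,x0) to (x0^x6, x7, ..., x1). As an illustrative reference only
 * (plain C sketch, not part of this file):
 *
 *   uint8_t lfsr3(uint8_t x) {
 *       return (uint8_t)((x >> 1) | (((x << 7) ^ (x << 1)) & 0x80));
 *   }
 *
 * As for LFSR2, one step is a single XOR between two slice words plus a
 * register renaming; the result is XORed into the round tweakeys already
 * holding LFSR2(TK2).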
******************************************************************************/ @ void tkschedule_lfsr_3(u32* rtk, const u8* tk2, const u8* tk2_bis, @ const int rounds) .global tkschedule_lfsr_3 .type tkschedule_lfsr_3,%function .align 2 tkschedule_lfsr_3: push {r0-r12, r14} ldm r1, {r5,r7,r9,r11} // load the 1st block in r5,r7,r9,r11 ldm r2, {r6,r8,r10,r12} // load the 2nd block in r6,r8,r10,r12 mov.w r1, r3 //load loop counter in r1 movw r2, #0x5555 movt r2, #0x5555 //r2 <- 0x55555555 movw r3, #0x3030 movt r3, #0x3030 //r3 <- 0x30303030 bl packing ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r5 // rtk <- tk2 ^ tk3 eor r3, r3, r6 // rtk <- tk2 ^ tk3 eor r4, r4, r7 // rtk <- tk2 ^ tk3 eor r14, r14, r8 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r9 // rtk <- tk2 ^ tk3 eor r3, r3, r10 // rtk <- tk2 ^ tk3 eor r4, r4, r11 // rtk <- tk2 ^ tk3 eor r14, r14, r12 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 loop_3: eor r12, r12, r6 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r12 // rtk <- tk2 ^ tk3 eor r3, r3, r5 // rtk <- tk2 ^ tk3 eor r4, r4, r6 // rtk <- tk2 ^ tk3 eor r14, r14, r7 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r8 // rtk <- tk2 ^ tk3 eor r3, r3, r9 // rtk <- tk2 ^ tk3 eor r4, r4, r10 // rtk <- tk2 ^ tk3 eor r14, r14, r11 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r11, r11, r5 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r11 // rtk <- tk2 ^ tk3 eor r3, r3, r12 // rtk <- tk2 ^ tk3 eor r4, r4, r5 // rtk <- tk2 ^ tk3 eor r14, r14, r6 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r7 // rtk <- tk2 ^ tk3 eor r3, r3, r8 // rtk <- tk2 ^ tk3 eor r4, r4, r9 // rtk <- tk2 ^ tk3 eor r14, r14, r10 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r10, r10, r12 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r10 // rtk <- tk2 ^ tk3 eor r3, r3, r11 // rtk <- tk2 ^ tk3 eor r4, r4, r12 // rtk <- tk2 ^ tk3 eor r14, r14, r5 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r6 // rtk <- tk2 ^ tk3 eor r3, r3, r7 // rtk <- tk2 ^ tk3 eor r4, r4, r8 // rtk <- tk2 ^ tk3 eor r14, r14, r9 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r9, r9, r11 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r9 // rtk <- tk2 ^ tk3 eor r3, r3, r10 // rtk <- tk2 ^ tk3 eor r4, r4, r11 // rtk <- tk2 ^ tk3 eor r14, r14, r12 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, 
r2, r5 // rtk <- tk2 ^ tk3 eor r3, r3, r6 // rtk <- tk2 ^ tk3 eor r4, r4, r7 // rtk <- tk2 ^ tk3 eor r14, r14, r8 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r8, r8, r10 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r8 // rtk <- tk2 ^ tk3 eor r3, r3, r9 // rtk <- tk2 ^ tk3 eor r4, r4, r10 // rtk <- tk2 ^ tk3 eor r14, r14, r11 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r12 // rtk <- tk2 ^ tk3 eor r3, r3, r5 // rtk <- tk2 ^ tk3 eor r4, r4, r6 // rtk <- tk2 ^ tk3 eor r14, r14, r7 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r7, r7, r9 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r7 // rtk <- tk2 ^ tk3 eor r3, r3, r8 // rtk <- tk2 ^ tk3 eor r4, r4, r9 // rtk <- tk2 ^ tk3 eor r14, r14, r10 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r11 // rtk <- tk2 ^ tk3 eor r3, r3, r12 // rtk <- tk2 ^ tk3 eor r4, r4, r5 // rtk <- tk2 ^ tk3 eor r14, r14, r6 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r6, r6, r8 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r6 // rtk <- tk2 ^ tk3 eor r3, r3, r7 // rtk <- tk2 ^ tk3 eor r4, r4, r8 // rtk <- tk2 ^ tk3 eor r14, r14, r9 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r10 // rtk <- tk2 ^ tk3 eor r3, r3, r11 // rtk <- tk2 ^ tk3 eor r4, r4, r12 // rtk <- tk2 ^ tk3 eor r14, r14, r5 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 2 beq exit_lfsr // if 0 then we are done eor r5, r5, r7 // apply LFSR3 to tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r5 // rtk <- tk2 ^ tk3 eor r3, r3, r6 // rtk <- tk2 ^ tk3 eor r4, r4, r7 // rtk <- tk2 ^ tk3 eor r14, r14, r8 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 ldm r0, {r2-r4,r14} // load rtk (computed by tkschedule_lfsr_2) eor r2, r2, r9 // rtk <- tk2 ^ tk3 eor r3, r3, r10 // rtk <- tk2 ^ tk3 eor r4, r4, r11 // rtk <- tk2 ^ tk3 eor r14, r14, r12 // rtk <- tk2 ^ tk3 stmia r0!, {r2-r4,r14} // store rtk after adding tk3 add r0, r0, #32 // same round tweakey every 2 rounds subs r1, r1, #2 // decrease loop counter by 8 bne loop_3 pop {r0-r12, r14} bx lr /****************************************************************************** * Compute TK = rearrange(perm(TK ^ TK1)) for all rounds. * The function 'rearrange' aims at reording bits for all round tweakeys to * match the fixsliced implementation of the SKINNY block cipher. 
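 *
 * The SKINNY round constants are also XORed into the round tweakeys here
 * (the 'add rconst' immediates), and several words are complemented with
 * 'mvn' to save one NOT per S-box evaluation. The constants come from the
 * usual 6-bit LFSR of SKINNY; as an illustrative reference only (plain C
 * sketch, not part of this file), starting from rc = 0:
 *
 *   uint8_t next_rc(uint8_t rc) {   // 6-bit round-constant LFSR
 *       return (uint8_t)(((rc << 1) & 0x3F) |
 *                        (((rc >> 5) ^ (rc >> 4) ^ 1) & 1));
 *   }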
******************************************************************************/ @ void tkschedule_perm(u32* rtk) .global tkschedule_perm .type tkschedule_perm,%function .align 2 tkschedule_perm: push {r0-r12, r14} sub.w sp, #4 // to store 'lr' during subroutines movw r4, #0xf0f0 movt r4, #0xf0f0 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) and r5, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x0000000c // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations mvn r10, r10 // to save 1 NOT in Sbox calculations mvn r6, r6 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x000000c0 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 strd r5, r6, [r0], #8 strd r8, r12, [r0], #8 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 and r3, r4, r11, ror #26 mvn r1, r1 eor r2, r2, #0x00000300 eor r3, r3, #0x00000300 eor r3, r3, #0x30000000 mvn r3, r3 stmia.w r0!, {r1-r3} and r1, r4, r10, ror #26 and r2, r4, r6, ror #26 and r3, r4, r7, ror #26 mvn r1, r1 mvn r2, r2 mvn r3, r3 stmia.w r0!, {r1-r3} and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 mvn r1, r1 stmia.w r0!, {r1-r2} and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #28 // --- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #12 and r1, r4, r6, ror #28 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #12 and r1, r4, r7, ror #28 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #12 and r1, r4, r8, ror #28 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #12 and r1, r4, r9, ror #28 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #12 and r1, r4, r10, ror #28 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #12 and r1, r4, r11, ror #28 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #12 and r1, r4, r12, ror #28 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #12 // ror and masks to match fixslicing --- mvn r8, r8 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x0c000000 // add rconst str.w r8, [r0], #4 stmia r0!, {r7,r9,r12} eor r10, r10, #0x0c000000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations str.w r10, [r0], #4 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0xcc000000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations stmia r0!, {r5,r6,r11} ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p4 // apply the permutation 4 times movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, 
r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x00300000 // add rconst eor r2, r2, #0x00000003 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00300000 // add rconst strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x00300000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00300000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r4, r7, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x00cc0000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations eor r10, r10, #0x00c00000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations mvn r6, r6 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00f00000 // add rconst eor r9, r9, #0x00c00000 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 strd r11, r10, [r0], #8 strd r5, r6, [r0], #8 strd r8, r12, [r0], #8 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p6 movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x00c00000 // add rconst eor r3, r3, #0x03c00000 // add rconst eor r3, r3, #0x00003000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, r4, r5, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #12 //--- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #28 and r1, r4, r6, ror #12 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #28 and r1, r4, r7, ror #12 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #28 and r1, r4, r8, ror #12 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #28 and r1, r4, r9, ror #12 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #28 and r1, r4, r10, 
ror #12 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #28 and r1, r4, r11, ror #12 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #28 and r1, r4, r12, ror #12 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #28 //ror and masks to match fixslicing --- mvn r7, r7 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 eor r9, r9, #0x00000c00 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00000c00 // add rconst eor r12, r12, #0x03000000 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x00000c00 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0x0000c000 // add rconst eor r11, r11, #0x03000000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p8 movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 eor r1, r1, #0x00000c30 // add rconst eor r3, r3, #0x00000c30 // add rconst eor r3, r3, #0x00030000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00000030 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x000000fc // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations eor r10, r10, #0x000000c0 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations mvn r6, r6 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x000000f0 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 strd r5, r6, [r0], #8 strd r8, r12, [r0], #8 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p10 movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 eor r1, r1, #0x00000300 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x00000300 // add rconst strd r1, r2, [r0], #8 and r1, r4, r11, ror #26 and r2, r4, r10, ror #26 eor r1, r1, #0x30000000 // add rconst eor r1, r1, 
#0x000003c0 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x00000300 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r1, r4, r6, ror #26 and r2, r4, r7, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #28 // --- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #12 and r1, r4, r6, ror #28 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #12 and r1, r4, r7, ror #28 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #12 and r1, r4, r8, ror #28 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #12 and r1, r4, r9, ror #28 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #12 and r1, r4, r10, ror #28 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #12 and r1, r4, r11, ror #28 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #12 and r1, r4, r12, ror #28 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #12 // ror and masks to match fixslicing --- mvn r8, r8 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 eor r9, r9, #0x0c000000 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00000300 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x0c000000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0xcc000000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p12 movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 eor r3, r3, #0x0c000000 // add rconst eor r2, r2, #0x00000003 // add rconst eor r2, r2, #0x0c000000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x00300000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00300000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r4, r7, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and 
r12, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x003c0000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations mvn r10, r10 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations strd r5, r6, [r0], #8 mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00f00000 // add rconst strd r8, r12, [r0], #8 eor r9, r9, #0x00c00000 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p14 movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x03c00000 // add rconst eor r3, r3, #0x03c00000 // add rconst eor r3, r3, #0x00003000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, r4, r5, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #12 //--- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #28 and r1, r4, r6, ror #12 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #28 and r1, r4, r7, ror #12 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #28 and r1, r4, r8, ror #12 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #28 and r1, r4, r9, ror #12 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #28 and r1, r4, r10, ror #12 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #28 and r1, r4, r11, ror #12 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #28 and r1, r4, r12, ror #12 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #28 //ror and masks to match fixslicing --- mvn r7, r7 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00000c00 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x00000c00 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0x0000cc00 // add rconst eor r11, r11, #0x03000000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 
and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 eor r3, r3, #0x00000030 // add rconst eor r3, r3, #0x00030000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 eor r1, r1, #0x00000030 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00000030 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x0000000c // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations eor r10, r10, #0x000000c0 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations strd r5, r6, [r0], #8 mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x000000f0 // add rconst strd r8, r12, [r0], #8 eor r9, r9, #0x000000c0 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 and r3, r4, r11, ror #26 eor r1, r1, #0x00000300 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x000000c0 // add rconst eor r3, r3, #0x000003c0 // add rconst eor r3, r3, #0x30000000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #26 and r2, r4, r6, ror #26 and r3, r4, r7, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r2} and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #28 // --- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #12 and r1, r4, r6, ror #28 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #12 and r1, r4, r7, ror #28 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #12 and r1, r4, r8, ror #28 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #12 and r1, r4, r9, ror #28 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #12 and r1, r4, r10, ror #28 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #12 and r1, r4, r11, ror #28 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #12 and r1, r4, r12, ror #28 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #12 // ror and masks to match fixslicing --- mvn r8, r8 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x0c000000 // add rconst eor r12, r12, #0x00000300 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x0c000000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 
mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0xc0000000 // add rconst eor r11, r11, #0x00000300 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p4 // apply the permutation 4 times movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x0c300000 // add rconst eor r2, r2, #0x00000003 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00300000 // add rconst strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x00300000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r4, r7, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x00cc0000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations eor r10, r10, #0x00c00000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations mvn r6, r6 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00300000 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 strd r11, r10, [r0], #8 strd r5, r6, [r0], #8 strd r8, r12, [r0], #8 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p6 movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00c00000 // add rconst eor r3, r3, #0x00003000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, 
r4, r5, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #12 //--- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #28 and r1, r4, r6, ror #12 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #28 and r1, r4, r7, ror #12 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #28 and r1, r4, r8, ror #12 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #28 and r1, r4, r9, ror #12 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #28 and r1, r4, r10, ror #12 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #28 and r1, r4, r11, ror #12 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #28 and r1, r4, r12, ror #12 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #28 //ror and masks to match fixslicing --- mvn r7, r7 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 eor r9, r9, #0x00000c00 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x03000000 // add rconst strd r9, r12, [r0], #8 mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0x0000c000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p8 movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 eor r1, r1, #0x00000c00 // add rconst eor r3, r3, #0x00030c00 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x0000003c // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations mvn r10, r10 // to save 1 NOT in Sbox calculations mvn r6, r6 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x000000c0 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 strd r5, r6, [r0], #8 strd r8, r12, [r0], #8 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p10 movw 
r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r1, r4, r11, ror #26 and r2, r4, r10, ror #26 eor r1, r1, #0x30000000 // add rconst eor r1, r1, #0x00000300 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r1, r4, r6, ror #26 and r2, r4, r7, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #28 // --- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #12 and r1, r4, r6, ror #28 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #12 and r1, r4, r7, ror #28 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #12 and r1, r4, r8, ror #28 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #12 and r1, r4, r9, ror #28 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #12 and r1, r4, r10, ror #28 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #12 and r1, r4, r11, ror #28 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #12 and r1, r4, r12, ror #28 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #12 // ror and masks to match fixslicing --- mvn r8, r8 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x0c000000 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x0c000000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0xc0000000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p12 movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 eor r3, r3, #0x00300000 // add rconst eor r2, r2, #0x00000003 // add rconst eor r2, r2, #0x00300000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x00300000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r4, r7, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 
1&2 only) and r10, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x00cc0000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations eor r10, r10, #0x00c00000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations strd r5, r6, [r0], #8 mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00f00000 // add rconst strd r8, r12, [r0], #8 mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p14 movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations eor r3, r3, #0x03c00000 // add rconst eor r3, r3, #0x00003000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, r4, r5, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #12 //--- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #28 and r1, r4, r6, ror #12 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #28 and r1, r4, r7, ror #12 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #28 and r1, r4, r8, ror #12 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #28 and r1, r4, r9, ror #12 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #28 and r1, r4, r10, ror #12 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #28 and r1, r4, r11, ror #12 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #28 and r1, r4, r12, ror #12 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #28 //ror and masks to match fixslicing --- mvn r7, r7 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 eor r9, r9, #0x00000c00 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x03000000 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x00000c00 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0x0000c000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 
NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 eor r1, r1, #0x00000c00 // add rconst eor r3, r3, #0x00030c00 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00000030 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x0000003c // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations mvn r10, r10 // to save 1 NOT in Sbox calculations strd r11, r10, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations strd r5, r6, [r0], #8 mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x000000f0 // add rconst strd r8, r12, [r0], #8 mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 and r3, r4, r11, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x00000300 // add rconst eor r3, r3, #0x000003c0 // add rconst eor r3, r3, #0x30000000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #26 and r2, r4, r6, ror #26 and r3, r4, r7, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 mvn r1, r1 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r2} and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #28 // --- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #12 and r1, r4, r6, ror #28 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #12 and r1, r4, r7, ror #28 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #12 and r1, r4, r8, ror #28 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #12 and r1, r4, r9, ror #28 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #12 and r1, r4, r10, ror #28 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #12 and r1, r4, r11, ror #28 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #12 and r1, r4, r12, ror #28 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #12 // ror and masks to match fixslicing --- mvn r8, r8 // to save 1 NOT in Sbox calculations mvn r7, r7 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 mvn r9, r9 // to save 1 NOT in Sbox calculations strd r9, r12, [r0], #8 eor r10, r10, #0x0c000000 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox 
calculations eor r11, r11, #0xcc000000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p4 // apply the permutation 4 times movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 eor r3, r3, #0x00300000 // add rconst eor r2, r2, #0x00000003 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 eor r2, r2, #0x00300000 // add rconst mvn r2, r2 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00300000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r5, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r6, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r7, r4, r7, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r8, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r9, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r10, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r11, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r12, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) eor r11, r11, #0x00cc0000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations mvn r10, r10 // to save 1 NOT in Sbox calculations mvn r6, r6 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00f00000 // add rconst eor r9, r9, #0x00c00000 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations mvn r7, r7 strd r11, r10, [r0], #8 strd r5, r6, [r0], #8 strd r8, r12, [r0], #8 strd r9, r7, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p6 movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT in Sbox calculations eor r2, r2, #0x00c00000 // add rconst eor r3, r3, #0x03c00000 // add rconst eor r3, r3, #0x00003000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing eor r1, r1, #0x03000000 // add rconst mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r2, r2 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations stmia.w r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, r4, r5, ror #10 // ror and mask to match fixslicing mvn r1, r1 // to save 1 NOT 
in Sbox calculations strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r5, ror #12 //--- ror and masks to match fixslicing and r5, r5, r4, lsl #6 orr r5, r1, r5, ror #28 and r1, r4, r6, ror #12 and r6, r6, r4, lsl #6 orr r6, r1, r6, ror #28 and r1, r4, r7, ror #12 and r7, r7, r4, lsl #6 orr r7, r1, r7, ror #28 and r1, r4, r8, ror #12 and r8, r8, r4, lsl #6 orr r8, r1, r8, ror #28 and r1, r4, r9, ror #12 and r9, r9, r4, lsl #6 orr r9, r1, r9, ror #28 and r1, r4, r10, ror #12 and r10, r10, r4, lsl #6 orr r10, r1, r10, ror #28 and r1, r4, r11, ror #12 and r11, r11, r4, lsl #6 orr r11, r1, r11, ror #28 and r1, r4, r12, ror #12 and r12, r12, r4, lsl #6 orr r12, r1, r12, ror #28 //ror and masks to match fixslicing --- mvn r7, r7 // to save 1 NOT in Sbox calculations mvn r8, r8 // to save 1 NOT in Sbox calculations strd r8, r7, [r0], #8 eor r9, r9, #0x00000c00 // add rconst mvn r9, r9 // to save 1 NOT in Sbox calculations eor r12, r12, #0x00000c00 // add rconst strd r9, r12, [r0], #8 eor r10, r10, #0x00000c00 // add rconst mvn r10, r10 // to save 1 NOT in Sbox calculations strd r10, r5, [r0], #8 mvn r6, r6 // to save 1 NOT in Sbox calculations eor r11, r11, #0x03000000 // add rconst eor r11, r11, #0x0000c000 // add rconst mvn r11, r11 // to save 1 NOT in Sbox calculations strd r6, r11, [r0], #8 ldm r0, {r5-r12} // load rtk = tk1 ^ lfsr2(tk2) ^ lfsr3(tk3) bl p8 movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 eor r1, r1, #0x00000c00 // add rconst eor r3, r3, #0x00000030 // add rconst eor r3, r3, #0x00030000 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 mvn r1, r1 // to save 1 NOT in Sbox calculations eor r3, r3, #0x00000030 // add rconst mvn r3, r3 // to save 1 NOT in Sbox calculations strd r1, r3, [r0], #8 add.w sp, #4 pop {r0-r12, r14} bx lr /****************************************************************************** * Compute TK = rearrange(perm(TK ^ TK1)) for all rounds. * The function 'rearrange' aims at reording bits for all round tweakeys to * match the fixsliced implementation of the SKINNY block cipher. 
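* Only TK1 is handled here: since TK1 is not updated by an LFSR, its round
* tweakeys repeat with period 16 (the tweakey permutation has order 16), so a
* single 16-round table (16 x 8 words = 512 bytes) is produced and the
* encryption/decryption routines below simply re-use that 512-byte table every
* 16 rounds (sub.w/add.w r0, #512). Only the bits of the two top rows are kept
* (masks such as 0xf0f0f0f0, or their rotated fixsliced counterparts), since
* AddRoundTweakey only affects rows 1 and 2. Note that, in addition to the rtk
* pointer in r0, the code also reads the two TK1 input blocks from r1 and r2.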
******************************************************************************/ @ void tkschedule_perm_tk1(u32* rtk) .global tkschedule_perm_tk1 .type tkschedule_perm_tk1,%function .align 2 tkschedule_perm_tk1: push {r0-r12, r14} sub.w sp, #32 // to store packed tk1 ldm r1, {r5,r7,r9,r11} // load the 1st block in r5,r7,r9,r11 ldm r2, {r6,r8,r10,r12} // load the 1st block in r5,r7,r9,r11 movw r2, #0x5555 movt r2, #0x5555 // r2 <- 0x55555555 movw r3, #0x3030 movt r3, #0x3030 // r3 <- 0x30303030 bl packing stm sp, {r5-r12} movw r4, #0xf0f0 movt r4, #0xf0f0 and r1, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) strd r1, r2, [r0], #8 bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 and r3, r4, r11, ror #26 stmia.w r0!, {r1-r3} and r1, r4, r10, ror #26 and r2, r4, r6, ror #26 and r3, r4, r7, ror #26 stmia.w r0!, {r1-r3} and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 stmia.w r0!, {r1-r2} and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r8, ror #28 // --- ror and masks to match fixslicing and r2, r8, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r7, ror #28 and r3, r7, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 and r1, r4, r9, ror #28 and r2, r9, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r12, ror #28 and r3, r12, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 and r1, r4, r10, ror #28 and r2, r10, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r5, ror #28 and r3, r5, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 and r1, r4, r6, ror #28 and r2, r6, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r11, ror #28 and r3, r11, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 bl p2 // apply the permutation twice movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r1, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r4, r7, 
ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) strd r1, r2, [r0], #8 bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing stmia r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing stmia r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, r4, r5, ror #10 // ror and mask to match fixslicing strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r8, ror #12 // --- ror and masks to match fixslicing and r2, r8, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r7, ror #12 and r3, r7, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 and r1, r4, r9, ror #12 and r2, r9, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r12, ror #12 and r3, r12, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 and r1, r4, r10, ror #12 and r2, r10, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r5, ror #12 and r3, r5, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 and r1, r4, r6, ror #12 and r2, r6, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r11, ror #12 and r3, r11, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 bl p2 // apply the permutation twice movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r1, r11, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r10, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r5, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r6, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r8, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r12, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r9, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r7, r4 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) strd r1, r2, [r0], #8 bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #26 and r2, r4, r12, ror #26 strd r1, r2, [r0], #8 and r1, r4, r11, ror #26 and r2, r4, r10, ror #26 strd r1, r2, [r0], #8 and r1, r4, r6, ror #26 and r2, r4, r7, ror #26 strd r1, r2, [r0], #8 and r1, r4, r8, ror #26 and r2, r4, r5, ror #26 strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r8, ror #28 // --- ror and masks to match fixslicing and r2, r8, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r7, ror #28 and r3, r7, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 and r1, r4, r9, ror #28 and r2, r9, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r12, ror #28 and r3, r12, r4, lsl #6 orr r3, r1, r3, ror #12 
strd r2, r3, [r0], #8 and r1, r4, r10, ror #28 and r2, r10, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r5, ror #28 and r3, r5, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 and r1, r4, r6, ror #28 and r2, r6, r4, lsl #6 orr r2, r1, r2, ror #12 and r1, r4, r11, ror #28 and r3, r11, r4, lsl #6 orr r3, r1, r3, ror #12 strd r2, r3, [r0], #8 bl p2 // apply the permutation twice movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r5, ror #14 // --- ror and masks to match fixslicing and r2, r5, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r6, ror #14 and r3, r6, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 and r1, r4, r7, ror #14 and r2, r7, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r8, ror #14 and r3, r8, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 and r1, r4, r11, ror #14 and r2, r11, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r12, ror #14 and r3, r12, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 and r1, r4, r9, ror #14 and r2, r9, r4, ror #4 orr r2, r1, r2, ror #6 and r1, r4, r10, ror #14 and r3, r10, r4, ror #4 orr r3, r1, r3, ror #6 strd r3, r2, [r0], #8 orr r4, r4, r4, lsl #2 // r4 <- 0xf0f0f0f0 and r1, r4, r11, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r4, r10, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r4, r5, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r4, r6, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r4, r8, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r3, r4, r12, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) stmia r0!, {r1-r3} and r1, r4, r9, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) and r2, r4, r7, ror #16 // tk &= 0xf0f0f0f0 (extract rows 1&2 only) strd r1, r2, [r0], #8 bl p2 // apply the permutation twice movw r4, #0xc3c3 movt r4, #0xc3c3 // r4 <- 0xc3c3c3c3 and r1, r4, r9, ror #10 // ror and mask to match fixslicing and r2, r4, r12, ror #10 // ror and mask to match fixslicing and r3, r4, r11, ror #10 // ror and mask to match fixslicing stmia.w r0!, {r1-r3} and r1, r4, r10, ror #10 // ror and mask to match fixslicing and r2, r4, r6, ror #10 // ror and mask to match fixslicing and r3, r4, r7, ror #10 // ror and mask to match fixslicing stmia.w r0!, {r1-r3} and r1, r4, r8, ror #10 // ror and mask to match fixslicing and r2, r4, r5, ror #10 // ror and mask to match fixslicing strd r1, r2, [r0], #8 and r4, r4, r4, lsr #6 // r4 <- 0x03030303 and r1, r4, r8, ror #12 // --- ror and masks to match fixslicing and r2, r8, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r7, ror #12 and r3, r7, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 and r1, r4, r9, ror #12 and r2, r9, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r12, ror #12 and r3, r12, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 and r1, r4, r10, ror #12 and r2, r10, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r5, ror #12 and r3, r5, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 and r1, r4, r6, ror #12 and r2, r6, r4, lsl #6 orr r2, r1, r2, ror #28 and r1, r4, r11, ror #12 and r3, r11, r4, lsl #6 orr r3, r1, r3, ror #28 strd r2, r3, [r0], #8 ldmia.w sp!, {r5-r12} movw r4, #0x3030 movt r4, #0x3030 // r4 <- 0x30303030 and r1, r4, r6, ror #30 and r2, r6, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r5, ror #30 and r2, r5, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0], #8 and r1, r4, r8, ror #30 and r2, r8, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r7, ror #30 and r2, r7, r4, ror #4 orr r3, r3, r2, ror 
#22 strd r1, r3, [r0], #8 and r1, r4, r12, ror #30 and r2, r12, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r11, ror #30 and r2, r11, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0], #8 and r1, r4, r10, ror #30 and r2, r10, r4, ror #4 orr r1, r1, r2, ror #22 and r3, r4, r9, ror #30 and r2, r9, r4, ror #4 orr r3, r3, r2, ror #22 strd r1, r3, [r0] pop {r0-r12, r14} bx lr /****************************************************************************** * Quadruple round of the SKINNY block cipher in a bitsliced manner. ******************************************************************************/ .align 2 quadruple_round: str.w r14, [sp] // store r14 on the stack orr r4, r5, r6 // state[0] | state[1] eor r8, r8, r4 // state[3] ^= (state[0] | state[1]) orr r4, r9, r10 // state[4] | state[5] eor r12, r12, r4 // state[7] ^= (state[4] | state[5]) orr r4, r11, r10 // state[6] | state[5] eor r6, r6, r4 // state[1] ^= (state[6] | state[5]) and r4, r8, r12 // state[3] & state[7] eor r7, r7, r4 // state[2] ^= (state[3] & state[7]) orn r4, r9, r12 // ~state[7] | state[4] eor r11, r11, r4 // state[6] ^= (~state[7] | state[4]) orn r4, r7, r6 // state[2] | ~state[1] eor r5, r5, r4 // state[0] ^= (state[2] | ~state[1]) orn r4, r7, r8 // ~state[3] | state[2] eor r9, r9, r4 // state[4] ^= (~state[3] | state[2]) and r4, r5, r11 // state[0] & state[6] eor r10, r10, r4 // state[5] ^= (state[6] & state[0]) ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldmia.w r0!, {r2-r4,r14} // load rtk_1 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_1 eor r6, r6, r3 // add rtk_1 eor r7, r7, r4 // add rtk_1 eor r8, r8, r14 // add rtk_1 ldmia.w r0!, {r2-r4, r14} // load rtk_1 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_1 eor r10, r10, r3 // add rtk_1 eor r11, r11, r4 // add rtk_1 eor r12, r12, r14 // add rtk_1 movw r2, #0x3030 movt r2, #0x3030 // r2 <- 0x30303030 and r4, r2, r5, ror #30 // --- mixcolumns eor r5, r5, r4, ror #24 and r4, r2, r5, ror #18 eor r5, r5, r4, ror #2 and r4, r2, r5, ror #6 eor r5, r5, r4, ror #4 and r4, r2, r6, ror #30 eor r6, r6, r4, ror #24 and r4, r2, r6, ror #18 eor r6, r6, r4, ror #2 and r4, r2, r6, ror #6 eor r6, r6, r4, ror #4 and r4, r2, r7, ror #30 eor r7, r7, r4, ror #24 and r4, r2, r7, ror #18 eor r7, r7, r4, ror #2 and r4, r2, r7, ror #6 eor r7, r7, r4, ror #4 and r4, r2, r8, ror #30 eor r8, r8, r4, ror #24 and r4, r2, r8, ror #18 eor r8, r8, r4, ror #2 and r4, r2, r8, ror #6 eor r8, r8, r4, ror #4 and r4, r2, r9, ror #30 eor r9, r9, r4, ror #24 and r4, r2, r9, ror #18 eor r9, r9, r4, ror #2 and r4, r2, r9, ror #6 eor r9, r9, r4, ror #4 and r4, r2, r10, ror #30 eor r10, r10, r4, ror #24 and r4, r2, r10, ror #18 eor r10, r10, r4, ror #2 and r4, r2, r10, ror #6 eor r10, r10, r4, ror #4 and r4, r2, r11, ror #30 eor r11, r11, r4, ror #24 and r4, r2, r11, ror #18 eor r11, r11, r4, ror #2 and r4, r2, r11, ror #6 eor r11, r11, r4, ror #4 and r4, r2, r12, ror #30 eor r12, r12, r4, ror #24 and r4, r2, r12, ror #18 eor r12, r12, r4, ror #2 and r4, r2, r12, ror #6 eor r12, r12, r4, ror #4 // mixcolumns --- orr r4, r7, r8 // state[2] | state[3] eor r9, r9, r4 // state[4] ^= (state[2] | state[3]) orr r4, r6, r11 // state[1] | 
state[6] eor r10, r10, r4 // state[5] ^= (state[6] | state[1]) orr r4, r5, r6 // state[0] | state[1] eor r8, r8, r4 // state[3] ^= (state[0] | state[1]) and r4, r9, r10 // state[4] & state[5] eor r12, r12, r4 // state[7] ^= (state[4] & state[5]) orn r4, r11, r10 // ~state[5] | state[6] eor r5, r5, r4 // state[0] ^= (~state[5] | state[6]) orn r4, r12, r8 // state[7] | ~state[3] eor r7, r7, r4 // state[2] ^= (state[7] | ~state[3]) orn r4, r12, r9 // state[7] | ~state[4] eor r11, r11, r4 // state[6] ^= (~state[4] | state[7]) and r4, r5, r7 // state[0] & state[2] eor r6, r6, r4 // state[1] ^= (state[0] & state[2]) ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldmia.w r0!, {r2-r4,r14} // load rtk_1 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_1 eor r6, r6, r3 // add rtk_1 eor r7, r7, r4 // add rtk_1 eor r8, r8, r14 // add rtk_1 ldmia.w r0!, {r2-r4, r14} // load rtk_1 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_1 eor r10, r10, r3 // add rtk_1 eor r11, r11, r4 // add rtk_1 eor r12, r12, r14 // add rtk_1 movw r2, #0x3030 movt r2, #0x3030 //r2 <- 0x30303030 and r4, r2, r5, ror #16 // --- mixcolumns eor r5, r5, r4, ror #30 and r4, r2, r5, ror #28 eor r5, r5, r4 and r4, r2, r5, ror #16 eor r5, r5, r4, ror #2 and r4, r2, r6, ror #16 eor r6, r6, r4, ror #30 and r4, r2, r6, ror #28 eor r6, r6, r4 and r4, r2, r6, ror #16 eor r6, r6, r4, ror #2 and r4, r2, r7, ror #16 eor r7, r7, r4, ror #30 and r4, r2, r7, ror #28 eor r7, r7, r4 and r4, r2, r7, ror #16 eor r7, r7, r4, ror #2 and r4, r2, r8, ror #16 eor r8, r8, r4, ror #30 and r4, r2, r8, ror #28 eor r8, r8, r4 and r4, r2, r8, ror #16 eor r8, r8, r4, ror #2 and r4, r2, r9, ror #16 eor r9, r9, r4, ror #30 and r4, r2, r9, ror #28 eor r9, r9, r4 and r4, r2, r9, ror #16 eor r9, r9, r4, ror #2 and r4, r2, r10, ror #16 eor r10, r10, r4, ror #30 and r4, r2, r10, ror #28 eor r10, r10, r4 and r4, r2, r10, ror #16 eor r10, r10, r4, ror #2 and r4, r2, r11, ror #16 eor r11, r11, r4, ror #30 and r4, r2, r11, ror #28 eor r11, r11, r4 and r4, r2, r11, ror #16 eor r11, r11, r4, ror #2 and r4, r2, r12, ror #16 eor r12, r12, r4, ror #30 and r4, r2, r12, ror #28 eor r12, r12, r4 and r4, r2, r12, ror #16 eor r12, r12, r4, ror #2 // mixcolumns --- orr r4, r12, r9 // state[7] | state[4] eor r11, r11, r4 // state[6] ^= (state[7] | state[4]) orr r4, r5, r8 // state[0] | state[3] eor r6, r6, r4 // state[1] ^= (state[0] | state[3]) orr r4, r7, r8 // state[2] | state[3] eor r9, r9, r4 // state[4] ^= (state[2] | state[3]) and r4, r6, r11 // state[1] & state[6] eor r10, r10, r4 // state[5] ^= (state[6] & state[1]) orn r4, r5, r6 // ~state[1] | state[0] eor r7, r7, r4 // state[2] ^= (~state[1] | state[0]) orn r4, r10, r9 // state[5] | ~state[4] eor r12, r12, r4 // state[7] ^= (state[5] | ~state[4]) orn r4, r10, r11 // ~state[6] | state[5] eor r5, r5, r4 // state[0] ^= (~state[6] | state[5]) and r4, r7, r12 // state[2] & state[7] eor r8, r8, r4 // state[3] ^= (state[2] & state[7]) ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldmia.w r1!, {r2-r4,r14} 
// load rtk_2_3 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldmia.w r0!, {r2-r4,r14} // load rtk_1 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_1 eor r6, r6, r3 // add rtk_1 eor r7, r7, r4 // add rtk_1 eor r8, r8, r14 // add rtk_1 ldmia.w r0!, {r2-r4, r14} // load rtk_1 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_1 eor r10, r10, r3 // add rtk_1 eor r11, r11, r4 // add rtk_1 eor r12, r12, r14 // add rtk_1 movw r2, #0x3030 movt r2, #0x3030 // r2 <- 0x30303030 and r4, r2, r5, ror #10 // --- mixcolumns eor r5, r5, r4, ror #4 and r4, r2, r5, ror #6 eor r5, r5, r4, ror #6 and r4, r2, r5, ror #26 eor r5, r5, r4 and r4, r2, r6, ror #10 eor r6, r6, r4, ror #4 and r4, r2, r6, ror #6 eor r6, r6, r4, ror #6 and r4, r2, r6, ror #26 eor r6, r6, r4 and r4, r2, r7, ror #10 eor r7, r7, r4, ror #4 and r4, r2, r7, ror #6 eor r7, r7, r4, ror #6 and r4, r2, r7, ror #26 eor r7, r7, r4 and r4, r2, r8, ror #10 eor r8, r8, r4, ror #4 and r4, r2, r8, ror #6 eor r8, r8, r4, ror #6 and r4, r2, r8, ror #26 eor r8, r8, r4 and r4, r2, r9, ror #10 eor r9, r9, r4, ror #4 and r4, r2, r9, ror #6 eor r9, r9, r4, ror #6 and r4, r2, r9, ror #26 eor r9, r9, r4 and r4, r2, r10, ror #10 eor r10, r10, r4, ror #4 and r4, r2, r10, ror #6 eor r10, r10, r4, ror #6 and r4, r2, r10, ror #26 eor r10, r10, r4 and r4, r2, r11, ror #10 eor r11, r11, r4, ror #4 and r4, r2, r11, ror #6 eor r11, r11, r4, ror #6 and r4, r2, r11, ror #26 eor r11, r11, r4 and r4, r2, r12, ror #10 eor r12, r12, r4, ror #4 and r4, r2, r12, ror #6 eor r12, r12, r4, ror #6 and r4, r2, r12, ror #26 eor r12, r12, r4 // mixcolumns --- orr r4, r10, r11 // state[5] | state[6] eor r5, r5, r4 // state[0] ^= (state[5] | state[6]) orr r4, r7, r9 // state[2] | state[4] eor r8, r8, r4 // state[3] ^= (state[2] | state[4]) orr r4, r9, r12 // state[7] | state[4] eor r11, r11, r4 // state[6] ^= (state[7] | state[4]) and r4, r5, r8 // state[0] & state[3] eor r6, r6, r4 // state[1] ^= (state[0] & state[3]) orn r4, r7, r8 // ~state[3] | state[2] eor r12, r12, r4 // state[7] ^= (~state[3] | state[2]) orn r4, r6, r11 // state[1] | ~state[6] eor r10, r10, r4 // state[5] ^= (state[1] | ~state[6]) orn r4, r6, r5 // ~state[0] | state[1] eor r7, r7, r4 // state[2] ^= (~state[0] | state[1]) and r4, r12, r10 // state[7] & state[5] eor r9, r9, r4 // state[4] ^= (state[7] & state[5]) ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldmia.w r1!, {r2-r4,r14} // load rtk_2_3 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldmia.w r0!, {r2-r4,r14} // load rtk_1 in r0,r2,r3,r4 eor r5, r5, r2 // add rtk_1 eor r6, r6, r3 // add rtk_1 eor r7, r7, r4 // add rtk_1 eor r8, r8, r14 // add rtk_1 ldmia.w r0!, {r2-r4, r14} // load rtk_1 in r0,r2,r3,r4 eor r9, r9, r2 // add rtk_1 eor r10, r10, r3 // add rtk_1 eor r11, r11, r4 // add rtk_1 eor r12, r12, r14 // add rtk_1 movw r2, #0x3030 movt r2, #0x3030 // r2 <- 0x30303030 and r4, r2, r5, ror #4 // --- mixcolumns eor r5, r5, r4, ror #26 and r4, r2, r5 eor r5, r5, r4, ror #4 and r4, r2, r5, ror #4 eor r5, r5, r4, ror #22 and r4, r2, r6, ror #4 eor r6, r6, r4, ror #26 and r4, r2, r6 eor r6, r6, r4, ror #4 and r4, r2, r6, ror #4 eor r6, r6, r4, ror #22 
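// (each state word below goes through three and+ror / eor+ror pairs: this is
// the fixsliced MixColumns, where a masked copy of one row is rotated into
// place and XORed into another row, i.e. three row additions per column mix)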
and r4, r2, r7, ror #4 eor r7, r7, r4, ror #26 and r4, r2, r7 eor r7, r7, r4, ror #4 and r4, r2, r7, ror #4 eor r7, r7, r4, ror #22 and r4, r2, r8, ror #4 eor r8, r8, r4, ror #26 and r4, r2, r8 eor r8, r8, r4, ror #4 and r4, r2, r8, ror #4 eor r8, r8, r4, ror #22 and r4, r2, r9, ror #4 eor r9, r9, r4, ror #26 and r4, r2, r9 eor r9, r9, r4, ror #4 and r4, r2, r9, ror #4 eor r9, r9, r4, ror #22 and r4, r2, r10, ror #4 eor r10, r10, r4, ror #26 and r4, r2, r10 eor r10, r10, r4, ror #4 and r4, r2, r10, ror #4 eor r10, r10, r4, ror #22 and r4, r2, r11, ror #4 eor r11, r11, r4, ror #26 and r4, r2, r11 eor r11, r11, r4, ror #4 and r4, r2, r11, ror #4 eor r11, r11, r4, ror #22 and r4, r2, r12, ror #4 eor r12, r12, r4, ror #26 and r4, r2, r12 eor r12, r12, r4, ror #4 and r4, r2, r12, ror #4 eor r12, r12, r4, ror #22 // mixcolumns --- // renaming slices for the sbox calculations // can be avoided with an octuple_round routine=>increase of the code size ldr.w r14, [sp] // restore link register eor r5, r5, r6 // --- swap state[0] with state[1] eor r6, r6, r5 eor r5, r5, r6 // swap state[0] with state[1] --- eor r7, r7, r8 // --- swap state[2] with state[3] eor r8, r8, r7 eor r7, r7, r8 // swap state[2] with state[3] --- eor r9, r9, r12 // --- swap state[4] with state[7] eor r12, r12, r9 eor r9, r9, r12 // swap state[4] with state[7] --- eor r11, r11, r10 // --- swap state[6] with state[5] eor r10, r10, r11 eor r11, r11, r10 // swap state[6] with state[5] --- bx lr /****************************************************************************** * Inverse quadruple round of fixsliced SKINNY-128 tweakable block cipher. * The 2 blocks are stored in r5-r12 (fixsliced representation). ******************************************************************************/ .align 2 inv_quadruple_round: str.w r14, [sp] // store r14 on the stack eor r5, r5, r6 // --- swap state[0] with state[1] eor r6, r6, r5 eor r5, r5, r6 // swap state[0] with state[1] --- eor r7, r7, r8 // --- swap state[2] with state[3] eor r8, r8, r7 eor r7, r7, r8 // swap state[2] with state[3] --- eor r9, r9, r12 // --- swap state[4] with state[7] eor r12, r12, r9 eor r9, r9, r12 // swap state[4] with state[7] --- eor r11, r11, r10 // --- swap state[6] with state[5] eor r10, r10, r11 eor r11, r11, r10 // swap state[6] with state[5] --- movw r2, #0x3030 movt r2, #0x3030 // mask for ininv_mixcolumns and r4, r2, r5, ror #4 // --- inv_mixcolumns_3 eor r5, r5, r4, ror #22 and r4, r2, r5 eor r5, r5, r4, ror #4 and r4, r2, r5, ror #4 eor r5, r5, r4, ror #26 and r4, r2, r6, ror #4 eor r6, r6, r4, ror #22 and r4, r2, r6 eor r6, r6, r4, ror #4 and r4, r2, r6, ror #4 eor r6, r6, r4, ror #26 and r4, r2, r7, ror #4 eor r7, r7, r4, ror #22 and r4, r2, r7 eor r7, r7, r4, ror #4 and r4, r2, r7, ror #4 eor r7, r7, r4, ror #26 and r4, r2, r8, ror #4 eor r8, r8, r4, ror #22 and r4, r2, r8 eor r8, r8, r4, ror #4 and r4, r2, r8, ror #4 eor r8, r8, r4, ror #26 and r4, r2, r9, ror #4 eor r9, r9, r4, ror #22 and r4, r2, r9 eor r9, r9, r4, ror #4 and r4, r2, r9, ror #4 eor r9, r9, r4, ror #26 and r4, r2, r10, ror #4 eor r10, r10, r4, ror #22 and r4, r2, r10 eor r10, r10, r4, ror #4 and r4, r2, r10, ror #4 eor r10, r10, r4, ror #26 and r4, r2, r11, ror #4 eor r11, r11, r4, ror #22 and r4, r2, r11 eor r11, r11, r4, ror #4 and r4, r2, r11, ror #4 eor r11, r11, r4, ror #26 and r4, r2, r12, ror #4 eor r12, r12, r4, ror #22 and r4, r2, r12 eor r12, r12, r4, ror #4 and r4, r2, r12, ror #4 eor r12, r12, r4, ror #26 // inv_mixcolumns_3 --- ldrd r4, r14, [r1], #-8 // load rtk_2_3 
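// (decryption walks the round tweakeys backwards: every ldrd post-decrements
// r1 (rtk_2_3) or r0 (rtk1) by 8, i.e. 32 bytes per round and per array; the
// round constants and NOT-saving complements are already folded into rtk_2_3)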
eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r0], #-8 // load rtk1 eor r11, r11, r4 // add rtk1 eor r12, r12, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r9, r9, r2 // add rtk1 eor r10, r10, r3 // add rtk1 ldrd r4, r14, [r0], #-8 // load rtk1 eor r7, r7, r4 // add rtk1 eor r8, r8, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r5, r5, r2 // add rtk1 eor r6, r6, r3 // add rtk1 and r4, r12, r10 // state[7] & state[5] eor r9, r9, r4 // state[4] ^= (state[7] & state[5]) orn r4, r6, r5 // ~state[0] | state[1] eor r7, r7, r4 // state[2] ^= (~state[0] | state[1]) orn r4, r6, r11 // state[1] | ~state[6] eor r10, r10, r4 // state[5] ^= (state[1] | ~state[6]) orn r4, r7, r8 // ~state[3] | state[2] eor r12, r12, r4 // state[7] ^= (~state[3] | state[2]) and r4, r5, r8 // state[0] & state[3] eor r6, r6, r4 // state[1] ^= (state[0] & state[3]) orr r4, r9, r12 // state[7] | state[4] eor r11, r11, r4 // state[6] ^= (state[7] | state[4]) orr r4, r7, r9 // state[2] | state[4] eor r8, r8, r4 // state[3] ^= (state[2] | state[4]) orr r4, r10, r11 // state[5] | state[6] eor r5, r5, r4 // state[0] ^= (state[5] | state[6]) movw r2, #0x3030 movt r2, #0x3030 // mask for inv_mixcolumns and r4, r2, r5, ror #26 // --- inv_mixcolumns_2 eor r5, r5, r4 and r4, r2, r5, ror #6 eor r5, r5, r4, ror #6 and r4, r2, r5, ror #10 eor r5, r5, r4, ror #4 and r4, r2, r6, ror #26 eor r6, r6, r4 and r4, r2, r6, ror #6 eor r6, r6, r4, ror #6 and r4, r2, r6, ror #10 eor r6, r6, r4, ror #4 and r4, r2, r7, ror #26 eor r7, r7, r4 and r4, r2, r7, ror #6 eor r7, r7, r4, ror #6 and r4, r2, r7, ror #10 eor r7, r7, r4, ror #4 and r4, r2, r8, ror #26 eor r8, r8, r4 and r4, r2, r8, ror #6 eor r8, r8, r4, ror #6 and r4, r2, r8, ror #10 eor r8, r8, r4, ror #4 and r4, r2, r9, ror #26 eor r9, r9, r4 and r4, r2, r9, ror #6 eor r9, r9, r4, ror #6 and r4, r2, r9, ror #10 eor r9, r9, r4, ror #4 and r4, r2, r10, ror #26 eor r10, r10, r4 and r4, r2, r10, ror #6 eor r10, r10, r4, ror #6 and r4, r2, r10, ror #10 eor r10, r10, r4, ror #4 and r4, r2, r11, ror #26 eor r11, r11, r4 and r4, r2, r11, ror #6 eor r11, r11, r4, ror #6 and r4, r2, r11, ror #10 eor r11, r11, r4, ror #4 and r4, r2, r12, ror #26 eor r12, r12, r4 and r4, r2, r12, ror #6 eor r12, r12, r4, ror #6 and r4, r2, r12, ror #10 eor r12, r12, r4, ror #4 // inv_mixcolumns_2 --- ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r0], #-8 // load rtk1 eor r11, r11, r4 // add rtk1 eor r12, r12, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r9, r9, r2 // add rtk1 eor r10, r10, r3 // add rtk1 ldrd r4, r14, [r0], #-8 // load rtk1 eor r7, r7, r4 // add rtk1 eor r8, r8, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r5, r5, r2 // add 
rtk1 eor r6, r6, r3 // add rtk1 and r4, r7, r12 // state[2] & state[7] eor r8, r8, r4 // state[3] ^= (state[2] & state[7]) orn r4, r10, r11 // ~state[6] | state[5] eor r5, r5, r4 // state[0] ^= (~state[6] | state[5]) orn r4, r10, r9 // state[5] | ~state[4] eor r12, r12, r4 // state[7] ^= (state[5] | ~state[4]) orn r4, r5, r6 // ~state[1] | state[0] eor r7, r7, r4 // state[2] ^= (~state[1] | state[0]) and r4, r6, r11 // state[1] & state[6] eor r10, r10, r4 // state[5] ^= (state[6] & state[1]) orr r4, r7, r8 // state[2] | state[3] eor r9, r9, r4 // state[4] ^= (state[2] | state[3]) orr r4, r5, r8 // state[0] | state[3] eor r6, r6, r4 // state[1] ^= (state[0] | state[3]) orr r4, r12, r9 // state[7] | state[4] eor r11, r11, r4 // state[6] ^= (state[7] | state[4]) movw r2, #0x3030 movt r2, #0x3030 // mask for inv_mixcolumns and r4, r2, r5, ror #16 // --- inv_mixcolumns_1 eor r5, r5, r4, ror #2 and r4, r2, r5, ror #28 eor r5, r5, r4 and r4, r2, r5, ror #16 eor r5, r5, r4, ror #30 and r4, r2, r6, ror #16 eor r6, r6, r4, ror #2 and r4, r2, r6, ror #28 eor r6, r6, r4 and r4, r2, r6, ror #16 eor r6, r6, r4, ror #30 and r4, r2, r7, ror #16 eor r7, r7, r4, ror #2 and r4, r2, r7, ror #28 eor r7, r7, r4 and r4, r2, r7, ror #16 eor r7, r7, r4, ror #30 and r4, r2, r8, ror #16 eor r8, r8, r4, ror #2 and r4, r2, r8, ror #28 eor r8, r8, r4 and r4, r2, r8, ror #16 eor r8, r8, r4, ror #30 and r4, r2, r9, ror #16 eor r9, r9, r4, ror #2 and r4, r2, r9, ror #28 eor r9, r9, r4 and r4, r2, r9, ror #16 eor r9, r9, r4, ror #30 and r4, r2, r10, ror #16 eor r10, r10, r4, ror #2 and r4, r2, r10, ror #28 eor r10, r10, r4 and r4, r2, r10, ror #16 eor r10, r10, r4, ror #30 and r4, r2, r11, ror #16 eor r11, r11, r4, ror #2 and r4, r2, r11, ror #28 eor r11, r11, r4 and r4, r2, r11, ror #16 eor r11, r11, r4, ror #30 and r4, r2, r12, ror #16 eor r12, r12, r4, ror #2 and r4, r2, r12, ror #28 eor r12, r12, r4 and r4, r2, r12, ror #16 eor r12, r12, r4, ror #30 // inv_mixcolumns_1 --- ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r0], #-8 // load rtk1 eor r11, r11, r4 // add rtk1 eor r12, r12, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r9, r9, r2 // add rtk1 eor r10, r10, r3 // add rtk1 ldrd r4, r14, [r0], #-8 // load rtk1 eor r7, r7, r4 // add rtk1 eor r8, r8, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r5, r5, r2 // add rtk1 eor r6, r6, r3 // add rtk1 and r4, r5, r7 // state[0] & state[2] eor r6, r6, r4 // state[1] ^= (state[0] & state[2]) orn r4, r12, r9 // state[7] | ~state[4] eor r11, r11, r4 // state[6] ^= (~state[4] | state[7]) orn r4, r12, r8 // state[7] | ~state[3] eor r7, r7, r4 // state[2] ^= (state[7] | ~state[3]) orn r4, r11, r10 // ~state[5] | state[6] eor r5, r5, r4 // state[0] ^= (~state[5] | state[6]) and r4, r9, r10 // state[4] & state[5] eor r12, r12, r4 // state[7] ^= (state[4] & state[5]) orr r4, r5, r6 // state[0] | state[1] eor r8, r8, r4 // state[3] ^= (state[0] | state[1]) orr r4, r6, r11 // state[1] | state[6] eor r10, r10, r4 // state[5] ^= (state[6] | state[1]) orr r4, r7, r8 // state[2] | state[3] eor r9, r9, r4 // state[4] ^= (state[2] | 
state[3]) movw r2, #0x3030 movt r2, #0x3030 // mask for inv_mixcolumns and r4, r2, r5, ror #6 // --- inv_mixcolumns_0 eor r5, r5, r4, ror #4 and r4, r2, r5, ror #18 eor r5, r5, r4, ror #2 and r4, r2, r5, ror #30 eor r5, r5, r4, ror #24 and r4, r2, r6, ror #6 eor r6, r6, r4, ror #4 and r4, r2, r6, ror #18 eor r6, r6, r4, ror #2 and r4, r2, r6, ror #30 eor r6, r6, r4, ror #24 and r4, r2, r7, ror #6 eor r7, r7, r4, ror #4 and r4, r2, r7, ror #18 eor r7, r7, r4, ror #2 and r4, r2, r7, ror #30 eor r7, r7, r4, ror #24 and r4, r2, r8, ror #6 eor r8, r8, r4, ror #4 and r4, r2, r8, ror #18 eor r8, r8, r4, ror #2 and r4, r2, r8, ror #30 eor r8, r8, r4, ror #24 and r4, r2, r9, ror #6 eor r9, r9, r4, ror #4 and r4, r2, r9, ror #18 eor r9, r9, r4, ror #2 and r4, r2, r9, ror #30 eor r9, r9, r4, ror #24 and r4, r2, r10, ror #6 eor r10, r10, r4, ror #4 and r4, r2, r10, ror #18 eor r10, r10, r4, ror #2 and r4, r2, r10, ror #30 eor r10, r10, r4, ror #24 and r4, r2, r11, ror #6 eor r11, r11, r4, ror #4 and r4, r2, r11, ror #18 eor r11, r11, r4, ror #2 and r4, r2, r11, ror #30 eor r11, r11, r4, ror #24 and r4, r2, r12, ror #6 eor r12, r12, r4, ror #4 and r4, r2, r12, ror #18 eor r12, r12, r4, ror #2 and r4, r2, r12, ror #30 eor r12, r12, r4, ror #24 ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r11, r11, r4 // add rtk_2_3 + rconst eor r12, r12, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r9, r9, r2 // add rtk_2_3 + rconst eor r10, r10, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r1], #-8 // load rtk_2_3 eor r7, r7, r4 // add rtk_2_3 + rconst eor r8, r8, r14 // add rtk_2_3 + rconst ldrd r2, r3, [r1], #-8 // load rtk_2_3 eor r5, r5, r2 // add rtk_2_3 + rconst eor r6, r6, r3 // add rtk_2_3 + rconst ldrd r4, r14, [r0], #-8 // load rtk1 eor r11, r11, r4 // add rtk1 eor r12, r12, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r9, r9, r2 // add rtk1 eor r10, r10, r3 // add rtk1 ldrd r4, r14, [r0], #-8 // load rtk1 eor r7, r7, r4 // add rtk1 eor r8, r8, r14 // add rtk1 ldrd r2, r3, [r0], #-8 // load rtk1 eor r5, r5, r2 // add rtk1 eor r6, r6, r3 // add rtk1 ldr.w r14, [sp] // restore link register and r4, r5, r11 // state[0] & state[6] eor r10, r10, r4 // state[5] ^= (state[6] & state[0]) orn r4, r7, r8 // ~state[3] | state[2] eor r9, r9, r4 // state[4] ^= (~state[3] | state[2]) orn r4, r7, r6 // state[2] | ~state[1] eor r5, r5, r4 // state[0] ^= (state[2] | ~state[1]) orn r4, r9, r12 // ~state[7] | state[4] eor r11, r11, r4 // state[6] ^= (~state[7] | state[4]) and r4, r8, r12 // state[3] & state[7] eor r7, r7, r4 // state[2] ^= (state[3] & state[7]) orr r4, r11, r10 // state[6] | state[5] eor r6, r6, r4 // state[1] ^= (state[6] | state[5]) orr r4, r9, r10 // state[4] | state[5] eor r12, r12, r4 // state[7] ^= (state[4] | state[5]) orr r4, r5, r6 // state[0] | state[1] eor r8, r8, r4 // state[3] ^= (state[0] | state[1]) bx lr /****************************************************************************** * Compute the SKINNY block cipher on a single block in a fixsliced manner. 
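* As everywhere in this file, two blocks are processed in parallel: the first
* in r5,r7,r9,r11 and the second in r6,r8,r10,r12. The 40 rounds are run as
* ten calls to quadruple_round; rtk_2_3 (2nd stack argument) provides one
* 8-word round tweakey per round with the round constants pre-added, while
* rtk_1 (1st stack argument) covers only 16 rounds and is rewound twice.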
******************************************************************************/
@ void skinny128_384(u8* ctext, u8* ctext_bis, const u8* ptext,
@       const u8* ptext_bis, const u32* rtk_1, const u32* rtk_2_3)
.global skinny128_384
.type   skinny128_384,%function
.align 2
skinny128_384:
    push    {r0-r12, r14}
    sub.w   sp, #4                  // to store r14 during subroutines
    ldm     r2, {r5,r7,r9,r11}      // load the 1st block in r5,r7,r9,r11
    ldm     r3, {r6,r8,r10,r12}     // load the 2nd block in r6,r8,r10,r12
    movw    r2, #0x5555
    movt    r2, #0x5555             // r2 <- 0x55555555
    movw    r3, #0x3030
    movt    r3, #0x3030             // r3 <- 0x30303030
    bl      packing
    ldrd    r0, r1, [sp, #60]       // get rtk_1 and rtk_2_3 addresses (stack arguments)
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    sub.w   r0, #512                // rtk_1 is reused every 16 rounds
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    bl      quadruple_round
    sub.w   r0, #512                // rtk_1 is reused every 16 rounds
    bl      quadruple_round
    bl      quadruple_round
    bl      unpacking
    ldrd    r0, r1, [sp, #4]        // restore the output addresses
    add.w   sp, #12
    stm     r0, {r5, r7, r9, r11}   // store the 1st enc block in [r0]
    stm     r1, {r6, r8, r10, r12}  // store the 2nd enc block in [r1]
    pop     {r2-r12, r14}
    bx      lr

/******************************************************************************
* Compute the inverse SKINNY block cipher (decryption) on two blocks in
* parallel in a fixsliced manner.
******************************************************************************/
@ void skinny128_384_inv(u8* ptext, u8* ptext_bis, const u8* ctext,
@       const u8* ctext_bis, const u32* rtk_1, const u32* rtk_2_3)
.global skinny128_384_inv
.type   skinny128_384_inv,%function
.align 2
skinny128_384_inv:
    push    {r0-r12, r14}
    sub.w   sp, #4                  // to store r14 during subroutines
    ldm     r2, {r5,r7,r9,r11}      // load the 1st block in r5,r7,r9,r11
    ldm     r3, {r6,r8,r10,r12}     // load the 2nd block in r6,r8,r10,r12
    movw    r2, #0x5555
    movt    r2, #0x5555             // r2 <- 0x55555555
    movw    r3, #0x3030
    movt    r3, #0x3030             // r3 <- 0x30303030
    bl      packing
    ldrd    r0, r1, [sp, #60]       // get rtk_1 and rtk_2_3 addresses (stack arguments)
    add.w   r0, #248                // points to the last rtk1
    add.w   r1, #1272               // points to the last rtk2_3
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    add.w   r0, #512                // wrap around the 512-byte rtk_1 table
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    add.w   r0, #512                // wrap around the 512-byte rtk_1 table
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      inv_quadruple_round
    bl      unpacking
    ldrd    r0, r1, [sp, #4]        // restore the output addresses
    add.w   sp, #12
    stm     r0, {r5, r7, r9, r11}   // store the 1st decrypted block in [r0]
    stm     r1, {r6, r8, r10, r12}  // store the 2nd decrypted block in [r1]
    pop     {r2-r12, r14}
    bx      lr
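
/******************************************************************************
* Illustrative C-level usage sketch (not part of the original code). Buffer
* sizes are inferred from the pointer arithmetic above: rtk_1 holds 16 rounds
* of 8 words (512 bytes) and rtk_2_3 holds 40 rounds of 8 words (1280 bytes).
* The routine that fills rtk_2_3 from TK2/TK3 is defined earlier in this file;
* tkschedule_perm_tk1 is declared above with a single 'u32* rtk' parameter but
* also reads the two TK1 input blocks from r1 and r2, so the extra parameters
* shown here (and all variable names) are assumptions.
*
*     u32 rtk_1[16 * 8];                      // TK1 round tweakeys
*     u32 rtk_2_3[40 * 8];                    // LFSR'd TK2/TK3 + rconsts
*     u8  c0[16], c1[16];                     // two ciphertext blocks
*
*     // ... fill rtk_2_3 with the tweakey-schedule routine above ...
*     tkschedule_perm_tk1(rtk_1, tk1_0, tk1_1);           // assumed signature
*     skinny128_384(c0, c1, p0, p1, rtk_1, rtk_2_3);      // encrypt 2 blocks
*     skinny128_384_inv(p0, p1, c0, c1, rtk_1, rtk_2_3);  // decrypt them back
******************************************************************************/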