diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S index ddaaa5f..00ca88e 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S @@ -4,8 +4,11 @@ Sebastien Riou, May 27th 2020 Implementation optimized for ARM-Cortex-M0 (Size and Speed) */ +//define __DRYGASCON_ARM_SELECTOR_V6M__ or add drygascon128_arm_selector.h to includes +#ifndef __DRYGASCON_ARM_SELECTOR_V6M__ #include "drygascon128_arm_selector.h" +#endif #if defined(__DRYGASCON_ARM_SELECTOR_V6M__) .cpu cortex-m0 .syntax unified diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S index ac0b62e..dcee981 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S @@ -22,7 +22,11 @@ the 'v7m_fpu_x' can be used to prevent this attack. Note that implementation 'v7m_fpu' is faster (but requires FPU). */ +//define __DRYGASCON_ARM_SELECTOR_V7M__ or add drygascon128_arm_selector.h to includes + +#ifndef __DRYGASCON_ARM_SELECTOR_V7M__ #include "drygascon128_arm_selector.h" +#endif #if defined(__DRYGASCON_ARM_SELECTOR_V7M__) .cpu cortex-m3 .syntax unified @@ -114,145 +118,104 @@ drygascon128_g_v7m_main_loop: //r0 to r9: c //r11: constant to add as round constant //r14: pointer on C + push {r14} // addition of round constant //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 
19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - ldr r12,[r14,#R32_1-C0] - eors r12,r12,r1 - str r12,[r14,#R32_1-C0] - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - ldr r12,[r14,#R32_0-C0] - eors r12,r12,r0 - str r12,[r14,#R32_0-C0] - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - ldr r12,[r14,#R32_3-C0] - eors r12,r12,r3 - str r12,[r14,#R32_3-C0] - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - ldr r12,[r14,#R32_2-C0] - eors r12,r12,r2 - str r12,[r14,#R32_2-C0] - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - ldr r12,[r14,#R32_0-C0] - eors r12,r12,r5 - str r12,[r14,#R32_0-C0] - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - ldr r12,[r14,#R32_3-C0] - eors r12,r12,r4 - str r12,[r14,#R32_3-C0] - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - ldr r12,[r14,#R32_2-C0] - eors r12,r12,r7 - str r12,[r14,#R32_2-C0] - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 - ldr r12,[r14,#R32_1-C0] - eors r12,r12,r6 - str r12,[r14,#R32_1-C0] + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + + + pop {r14} + + // accumulate + adds r12,r14,#R0 + LDMIA.W r12, {r10,r11} + eor r10,r10,r0 + eor r11,r11,r1 + eor r10,r10,r5 + eor r11,r11,r6 + STMIA.W r12, {r10,r11} + + adds r12,r14,#R1 + LDMIA.W r12, {r10,r11} + eor r10,r10,r2 + eor r11,r11,r3 + eor r10,r10,r7 + eor r11,r11,r4 + STMIA.W r12, {r10,r11} //state: //r0 to r9: c @@ -391,120 +354,83 @@ drygascon128_f_v7m_mix128_coreround: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, 
r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#16] cmp r10,#130 @@ -562,6 +488,7 @@ drygascon128_g0_v7m: //r11 = ((0xf - 0) << 4) | 0; movs r11,#0xf0 + push {r14} //state: //r0 to r9: c //r11: constant to add as round constant @@ -570,120 +497,84 @@ drygascon128_g0_v7m: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - 
mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed + pop {r14} //update C STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S index 326ba93..a4b0f10 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S +++ 
b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S @@ -20,9 +20,13 @@ Reference manual) shows data cache lines of 16 bytes. - In the unlikely case in which none of the condition can be met, the 'v7m_fpu_x' can be used to prevent this attack. */ +//define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ or add drygascon128_arm_selector.h to includes + +#ifndef __DRYGASCON_ARM_SELECTOR_V7M_FPU__ #include "drygascon128_arm_selector.h" -#if defined(__DRYGASCON_ARM_SELECTOR_FPU__) -.cpu cortex-m3 +#endif +#if defined(__DRYGASCON_ARM_SELECTOR_V7M_FPU__) +.cpu cortex-m4 .syntax unified .code 16 .thumb_func @@ -80,10 +84,11 @@ drygascon128_g_v7m_fpu: // 0 state address //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + movs r10,#0 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //round=r10=rounds-1; subs r11,r1,#1 @@ -117,141 +122,100 @@ drygascon128_g_v7m_fpu_main_loop: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - - vmov r14,S11 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //r14 is R32_1 - eors r14,r14,r1 - vmov r12,S10 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - //r12 is R32_0 - eors r12,r12,r0 - //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //r12 is R32_0 - eors r12,r12,r5 - vmov S10,r12 - vmov r12,S13 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - //r12 is R32_3 - eors r12,r12,r4 + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ 
gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //r12 is R32_3 - eors r12,r12,r3 - vmov S13,r12 - vmov r12,S12 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - //r12 is R32_2 - eors r12,r12,r2 + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 + //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //r12 is R32_2 - eors r12,r12,r7 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + + // accumulate + vmov r10,S10 + vmov r11,S11 + vmov r12,S12 + vmov r14,S13 + eor r10,r10,r0 + eor r11,r11,r1 + eor r12,r12,r2 + eor r14,r14,r3 + eor r10,r10,r5 + eor r11,r11,r6 + eor r12,r12,r7 + eor r14,r14,r4 + vmov S10,r10 + vmov S11,r11 vmov S12,r12 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 - //r14 is R32_1 - eors r14,r14,r6 - vmov S11,r14 + vmov S13,r14 //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#4] @@ -295,10 +259,10 @@ drygascon128_f_v7m_fpu: push {r0,r3,r10} //make the same stack frame as drygascon128_g_cm7 push {r1,r2} //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //Load C adds r11,r0,#C0 @@ -392,120 +356,83 @@ drygascon128_f_v7m_fpu_mix128_coreround: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor 
r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#16] cmp r10,#130 @@ -562,6 +489,7 @@ drygascon128_g0_v7m_fpu: //r11 = ((0xf - 0) << 4) | 0; movs r11,#0xf0 + push {r14} //state: //r0 to r9: c //r11: constant to add as round constant @@ -570,120 +498,84 @@ drygascon128_g0_v7m_fpu: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + 
eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed + pop {r14} //update C STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S index 6bf1cbc..5071eee 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S @@ -7,8 +7,12 @@ Include protection against timing attack on X look up operations Note that implementation 'v7m_fpu' is faster and safe on all Cortex-M7 as of May 2020. 
*/ +//define __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ or add drygascon128_arm_selector.h to includes + +#ifndef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ #include "drygascon128_arm_selector.h" -#if defined(__DRYGASCON_ARM_SELECTOR_FPU_X__) +#endif +#if defined(__DRYGASCON_ARM_SELECTOR_V7M_FPU_X__) .cpu cortex-m7 .syntax unified .code 16 @@ -67,10 +71,11 @@ drygascon128_g_v7m_fpu_x: // 0 state address //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + movs r10,#0 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //round=r10=rounds-1; subs r11,r1,#1 @@ -104,141 +109,100 @@ drygascon128_g_v7m_fpu_x_main_loop: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - - vmov r14,S11 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //r14 is R32_1 - eors r14,r14,r1 - vmov r12,S10 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - //r12 is R32_0 - eors r12,r12,r0 - //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //r12 is R32_0 - eors r12,r12,r5 - vmov S10,r12 - vmov r12,S13 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - //r12 is R32_3 - eors r12,r12,r4 + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //r12 is R32_3 - eors r12,r12,r3 - vmov S13,r12 - vmov r12,S12 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - //r12 is R32_2 - 
eors r12,r12,r2 + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 + //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //r12 is R32_2 - eors r12,r12,r7 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + + // accumulate + vmov r10,S10 + vmov r11,S11 + vmov r12,S12 + vmov r14,S13 + eor r10,r10,r0 + eor r11,r11,r1 + eor r12,r12,r2 + eor r14,r14,r3 + eor r10,r10,r5 + eor r11,r11,r6 + eor r12,r12,r7 + eor r14,r14,r4 + vmov S10,r10 + vmov S11,r11 vmov S12,r12 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 - //r14 is R32_1 - eors r14,r14,r6 - vmov S11,r14 + vmov S13,r14 //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#4] @@ -282,10 +246,10 @@ drygascon128_f_v7m_fpu_x: push {r0,r3,r10} //make the same stack frame as drygascon128_g_cm7 push {r1,r2} //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //Load C adds r11,r0,#C0 @@ -390,120 +354,83 @@ drygascon128_f_v7m_fpu_x_mix128_coreround: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors 
r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#16] cmp r10,#130 @@ -560,6 +487,7 @@ drygascon128_g0_v7m_fpu_x: //r11 = ((0xf - 0) << 4) | 0; movs r11,#0xf0 + push {r14} //state: //r0 to r9: c //r11: constant to add as round constant @@ -568,120 +496,84 @@ drygascon128_g0_v7m_fpu_x: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, 
r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed + pop {r14} //update C STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h index fb2275a..9c86969 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h @@ -3,41 +3,73 @@ //Optional file to select the best implementation for each chip #ifdef STM32H743xx - #define __DRYGASCON_ARM_SELECTOR_V7M__ - #define __DRYGASCON_ARM_SELECTOR_FPU__ + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef STM32F746xx + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ +#endif + +#ifdef STM32F411xx + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ +#endif + +#ifdef STM32L552xx //technically it is V8M but we don't have a specific code for that one #define __DRYGASCON_ARM_SELECTOR_V7M__ - #define __DRYGASCON_ARM_SELECTOR_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef 
STM32F103xx #define __DRYGASCON_ARM_SELECTOR_V7M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef STM32L011xx #define __DRYGASCON_ARM_SELECTOR_V6M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef __SAM3X8E__ #define __DRYGASCON_ARM_SELECTOR_V7M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif //TODO: add more chips here -#ifdef __DRYGASCON_ARM_SELECTOR_V7M__ - #ifdef __DRYGASCON_ARM_SELECTOR_FPU__ - #define DRYGASCON_G_OPT drygascon128_g_v7m_fpu - #define DRYGASCON_F_OPT drygascon128_f_v7m_fpu - #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu - #else - #define DRYGASCON_G_OPT drygascon128_g_v7m - #define DRYGASCON_F_OPT drygascon128_f_v7m - #define DRYGASCON_G0_OPT drygascon128_g0_v7m +#ifndef __DRYGASCON_ARM_SELECTOR_FOUND__ + //more generic defines catching whole families + #if defined(STM32F4xx) || defined(STM32F7xx) || defined(STM32H7xx) + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ + #endif + + #if defined(STM32F1xx) + #define __DRYGASCON_ARM_SELECTOR_V7M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #endif +#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define DRYGASCON_G_OPT drygascon128_g_v7m_fpu + #define DRYGASCON_F_OPT drygascon128_f_v7m_fpu + #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu +#endif + +#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ + #define DRYGASCON_G_OPT drygascon128_g_v7m_fpu_x + #define DRYGASCON_F_OPT drygascon128_f_v7m_fpu_x + #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu_x +#endif + +#ifdef __DRYGASCON_ARM_SELECTOR_V7M__ + #define DRYGASCON_G_OPT drygascon128_g_v7m + #define DRYGASCON_F_OPT drygascon128_f_v7m + #define DRYGASCON_G0_OPT drygascon128_g0_v7m +#endif + #ifdef __DRYGASCON_ARM_SELECTOR_V6M__ #define DRYGASCON_G_OPT drygascon128_g_v6m #define DRYGASCON_F_OPT drygascon128_f_v6m diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h index 0f907bd..977958c 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h @@ -245,7 +245,7 @@ typedef union */ typedef struct { - gascon128_state_t c; /**< GASCON-128 state for the capacity */ + gascon128_state_t c; /**< GASCON-128 state for the capacity */ uint32_t domain; /**< Domain value to mix on next F call */ uint32_t rounds; /**< Number of rounds for next G call */ drysponge128_rate_t r; /**< Buffer for a rate block of data */ diff --git a/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c index a1061d5..5902d16 100644 --- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c +++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c @@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - 
QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); + for(int i = 0; i < 10; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } \ No newline at end of file diff --git a/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c index 1da4277..5a20b37 100644 --- a/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c +++ b/romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c @@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). +******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tmp, tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. 
******************************************************************************/ @@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) diff --git a/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c index 87718e3..326d617 100644 --- a/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c +++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c @@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2 u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - 
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); + for(int i = 0; i < 14; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } \ No newline at end of file diff --git a/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c index c818cf2..0c364f3 100644 --- a/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c +++ b/romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c @@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). 
+******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tmp, tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. ******************************************************************************/ @@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 
1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) @@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { void precompute_rtk1(u32* rtk1, const u8* tk1) { memset(rtk1, 0x00, 16*16); permute_tk(rtk1, tk1, 16); -} \ No newline at end of file +} diff --git a/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c index a1061d5..5902d16 100644 --- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c +++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c @@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); + 
for(int i = 0; i < 10; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } \ No newline at end of file diff --git a/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c index 1da4277..5a20b37 100644 --- a/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c +++ b/romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c @@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). +******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tmp, tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. 
******************************************************************************/ @@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) diff --git a/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c b/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c index 87718e3..326d617 100644 --- a/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c +++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c @@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2 u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - 
QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); + for(int i = 0; i < 14; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } \ No newline at end of file diff --git a/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c b/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c index c818cf2..0c364f3 100644 --- a/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c +++ b/romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c @@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). 
+******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tmp, tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. ******************************************************************************/ @@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 
1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) @@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { void precompute_rtk1(u32* rtk1, const u8* tk1) { memset(rtk1, 0x00, 16*16); permute_tk(rtk1, tk1, 16); -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c index 4a1b26e..fbe0318 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c @@ -8,12 +8,10 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). 
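Note on the rolled QUADRUPLE_ROUND loops introduced throughout this patch: the replacement "for(int i = 0; i < 14; i++) QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);" is only a re-expression of the removed unrolled sequence, since rtk1 is reused with period four (offsets 0, 16, 32, 48) while rtk2_3 is consumed linearly, 16 u32 words per quadruple round. The same indexing runs in reverse for the INV_QUADRUPLE_ROUND loops (i from 13, or 9 for the 40-round "+" variant, down to 0), and the double-block opt32_2 implementations use a stride of 32 words instead of 16. Below is a minimal, host-side sketch — illustrative only, not part of the patch; rtk1_ref and rtk2_3_ref are simply the literal offsets copied from the removed lines — that checks the rolled indexing against the original unrolled offsets for the 14-quadruple-round (56-round) case.

/* Illustrative only -- not part of the patch. Verifies that the rolled loop
 * "QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16)" visits the same
 * tweakey offsets, in the same order, as the original unrolled call sequence. */
#include <assert.h>
#include <stdio.h>

int main(void) {
    /* offsets used by the removed unrolled code, copied from the diff above */
    const int rtk1_ref[14]   = {0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48, 0, 16};
    const int rtk2_3_ref[14] = {0, 16, 32, 48, 64, 80, 96, 112,
                                128, 144, 160, 176, 192, 208};
    for (int i = 0; i < 14; i++) {
        assert((i % 4) * 16 == rtk1_ref[i]);   /* rtk1 repeats every 4 calls  */
        assert(i * 16 == rtk2_3_ref[i]);       /* rtk2_3 is consumed linearly */
    }
    printf("rolled loop matches the unrolled offsets for all 14 quadruple rounds\n");
    return 0;
}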
diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c index 2082889..d37c68f 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /****************************************************************************** * The MixColumns computation for rounds i such that (i % 4) == 0 @@ -153,16 +150,8 @@ void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); + for(int i = 0; i < 10; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } @@ -176,15 +165,7 @@ void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3); + for(int i = 9; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } \ No newline at end of file diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c index 1da4277..7a1111b 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c @@ -4,16 +4,11 @@ * @author Alexandre Adomnicai, 
Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include //for memcmp -#include "tk_schedule.h" +#include #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; - /****************************************************************************** * The round constants according to the new representation. ******************************************************************************/ @@ -260,6 +255,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). +******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. 
******************************************************************************/ @@ -267,19 +326,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -296,8 +356,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -310,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -328,8 +392,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -350,8 +416,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h index 5615cbd..d5acc39 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h @@ -1,22 +1,16 @@ #ifndef TK_SCHEDULE_H_ #define TK_SCHEDULE_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include +typedef uint8_t u8; +typedef uint32_t u32; void packing(u32* out, const u8* in); void unpacking(u8* out, u32 *in); void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3); void precompute_rtk1(u32* rtk1, const u8* 
tk1); -#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) - -#define XOR_BLOCKS(x,y) ({ \ - (x)[0] ^= (y)[0]; \ - (x)[1] ^= (y)[1]; \ - (x)[2] ^= (y)[2]; \ - (x)[3] ^= (y)[3]; \ -}) +#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) #define SWAPMOVE(a, b, mask, n) ({ \ tmp = (b ^ (a >> n)) & mask; \ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c index fa46817..edf906c 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c @@ -8,12 +8,10 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). @@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, u8 feedback; u8 tmp[2*BLOCKBYTES]; memset(tmp, 0x00, 2*BLOCKBYTES); - memset(auth, 0x00, BLOCKBYTES); SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); + memset(auth, 0x00, BLOCKBYTES); while (adlen >= 2*BLOCKBYTES) { LE_STR_64(tmp, lfsr); UPDATE_LFSR(lfsr); LE_STR_64(tmp + BLOCKBYTES, lfsr); - SET_DOMAIN(tmp + BLOCKBYTES, 0x02); precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES); skinny128_384_plus_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk); xor_block(auth, tmp); @@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, adlen -= 2*BLOCKBYTES; ad += 2*BLOCKBYTES; UPDATE_LFSR(lfsr); + memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM + SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); } if (adlen > BLOCKBYTES) { // pad and process 2 blocs in // LE_STR_64(tmp, lfsr); @@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, LE_STR_64(tmp, lfsr); if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_plus_encrypt(auth, c, ad, c, *tk); + skinny128_384_plus_encrypt(tmp, c, ad, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk); + skinny128_384_plus_encrypt(tmp, tmp, ad, ad, *tk); } + xor_block(auth, tmp); } else if (adlen > 0) { LE_STR_64(tmp, lfsr); SET_DOMAIN(tmp, 0x03); // domain for padding ad @@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, tmp[BLOCKBYTES + adlen] ^= 0x80; // padding if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); + skinny128_384_plus_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); + skinny128_384_plus_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); } + xor_block(auth, tmp); } } @@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, feedback |= sum[i] ^ c[i]; // constant-time tag verification return feedback; // 
----------------- Process the associated data ----------------- -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c index ed1e619..01d9f61 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /**************************************************************************** * The MixColumns operation for rounds i such that (i % 4) == 0. @@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 0 +* The inverse MixColumns operation for rounds i such that (i % 4) == 0 ****************************************************************************/ void inv_mixcolumns_0(u32* state) { u32 tmp; @@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 1 +* The inverse MixColumns operation for rounds i such that (i % 4) == 1 ****************************************************************************/ void inv_mixcolumns_1(u32* state) { u32 tmp; @@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 2 +* The inverse MixColumns operation for rounds i such that (i % 4) == 2 ****************************************************************************/ void inv_mixcolumns_2(u32* state) { u32 tmp; @@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 3 +* The inverse MixColumns operation for rounds i such that (i % 4) == 3 ****************************************************************************/ void inv_mixcolumns_3(u32* state) { u32 tmp; @@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const tweakey tk) { u32 state[8]; packing(state, ptext, ptext_bis); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); + for(int i = 0; i < 10; i++) + QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ctext, ctext_bis, state); } @@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const tweakey tk) { u32 state[8]; packing(state, ctext, ctext_bis); - 
INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); + for(int i = 9; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ptext, ptext_bis, state); -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h index 01dd271..3be91ec 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h @@ -1,5 +1,6 @@ #ifndef SKINNY128_H_ #define SKINNY128_H_ + #include "tk_schedule.h" void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c index ae7a820..7dbe1c3 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c @@ -7,15 +7,11 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 *******************************************************************************/ -#include #include #include "tk_schedule.h" -typedef unsigned char u8; -typedef unsigned int u32; - /**************************************************************************** * The round constants according to the fixsliced representation. 
****************************************************************************/ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h index 29a2ddb..3779f90 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h @@ -1,8 +1,10 @@ #ifndef TK_SCHEDULE_BS_H_ #define TK_SCHEDULE_BS_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include + +typedef uint8_t u8; +typedef uint32_t u32; typedef struct { u32 rtk1[8*16]; diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c index 838c830..fc9af45 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c @@ -1,19 +1,15 @@ /****************************************************************************** -* Constant-time implementation of SKINNY-AEAD-M1 (v1.1). -* -* Two blocks are treated in parallel with SKINNY-128-384 whenever possible. +* Constant-time implementation of SKINNY-AEAD-M1(v1). * * For more details, see the paper at: https:// * * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). 
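Note on the skinny_aead_m1_auth hunks shown earlier (opt32_2 encrypt.c): the change encrypts each associated-data block into the scratch buffer 'tmp' and then calls xor_block(auth, tmp), so 'auth' is a running XOR over all processed blocks rather than being overwritten by the last block's result. The sketch below is a minimal model of that accumulator pattern under stated assumptions — it is illustrative only; toy_encrypt and toy_auth are hypothetical stand-ins, not the real skinny128_384_encrypt, and the real code additionally processes two blocks in parallel and handles domain separation, the LFSR tweak and padding, which are omitted here.

/* Illustrative only -- a minimal model of the running-XOR accumulator pattern
 * established by the encrypt.c changes. 'toy_encrypt' is a placeholder
 * transformation, NOT the real SKINNY-128-384 primitive. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCKBYTES 16

/* hypothetical stand-in for the real block encryption */
static void toy_encrypt(uint8_t out[BLOCKBYTES], const uint8_t in[BLOCKBYTES]) {
    for (int i = 0; i < BLOCKBYTES; i++)
        out[i] = (uint8_t)(in[i] ^ 0xa5);      /* placeholder, not SKINNY */
}

static void xor_block(uint8_t *x, const uint8_t *y) {
    for (int i = 0; i < BLOCKBYTES; i++)
        x[i] ^= y[i];
}

/* auth = XOR over all blocks of toy_encrypt(block): each block contributes
 * via xor_block; the accumulator is never overwritten. */
static void toy_auth(uint8_t auth[BLOCKBYTES], const uint8_t *ad, size_t adlen) {
    uint8_t tmp[BLOCKBYTES];
    memset(auth, 0x00, BLOCKBYTES);
    while (adlen >= BLOCKBYTES) {
        toy_encrypt(tmp, ad);
        xor_block(auth, tmp);                  /* accumulate, do not overwrite */
        ad += BLOCKBYTES;
        adlen -= BLOCKBYTES;
    }
}

int main(void) {
    uint8_t ad[3 * BLOCKBYTES] = {0};
    uint8_t auth[BLOCKBYTES];
    toy_auth(auth, ad, sizeof(ad));
    for (int i = 0; i < BLOCKBYTES; i++)
        printf("%02x", auth[i]);
    printf("\n");
    return 0;
}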
@@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen, } LE_STR_64(tmp, lfsr); // lfsr for tag computation precompute_rtk1(rtk1, tmp); - for(int i = 0; i < 16; i++) { - printf("%08x %08x %08x %08x\n",rtk1[i*4], rtk1[i*4+1],rtk1[i*4+2],rtk1[i*4+3]); - } - for(int i = 0; i < 56; i++) { - printf("%08x %08x %08x %08x\n",rtk2_3[i*4], rtk2_3[i*4+1],rtk2_3[i*4+2],rtk2_3[i*4+3]); - } skinny128_384_encrypt(c, c, rtk1, rtk2_3); // compute the tag // ----------------- Process the plaintext ----------------- @@ -200,4 +190,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, feedback |= sum[i] ^ c[i]; // constant-time tag verification return feedback; // ----------------- Process the associated data ----------------- -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c index e6177a2..f0e11c9 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /****************************************************************************** * The MixColumns computation for rounds i such that (i % 4) == 0 @@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - QUADRUPLE_ROUND(state, rtk1, rtk2_3); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160); - QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176); - QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); - QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); + for(int i = 0; i < 14; i++) + QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation } @@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1, u32 tmp; // used in SWAPMOVE macro u32 state[4]; // 128-bit state packing(state, ptext); // from byte to bitsliced representation - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+192); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64); - INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48); - INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32); - INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16); - INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3); + 
for(int i = 13; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16); unpacking(ctext, state); // from bitsliced to byte representation -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c index c818cf2..b09a0b2 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c @@ -4,16 +4,11 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include //for memcmp -#include "tk_schedule.h" +#include #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; - /****************************************************************************** * The round constants according to the new representation. ******************************************************************************/ @@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) { } /****************************************************************************** +* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys. +* It is equivalent to the following 2 function calls: +* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS); +* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS); +* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform. +* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be +* explained by the fact that less memory accesses to 'tk' are computed. +* +* To save some code size, the loop can be replaced by the following one: +* for(int i = 0 ; i < rounds; i+=2) { +* lfsr2_bs(tk2); +* lfsr3_bs(tk3); +* tk[i*4+4] = tk2[0] ^ tk3[0]; +* tk[i*4+5] = tk2[1] ^ tk3[1]; +* tk[i*4+6] = tk2[2] ^ tk3[2]; +* tk[i*4+7] = tk2[3] ^ tk3[3]; +* } +* at the cost of some cycles (~260 on ARM Cortex-M). 
+******************************************************************************/ +void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) { + u32 tk2[4], tk3[4]; + packing(tk2, t2); + packing(tk3, t3); + tk[0] = tk2[0] ^ tk3[0]; + tk[1] = tk2[1] ^ tk3[1]; + tk[2] = tk2[2] ^ tk3[2]; + tk[3] = tk2[3] ^ tk3[3]; + for(int i = 0 ; i < rounds; i+=8) { + tk2[0] ^= (tk2[2] & 0xaaaaaaaa); + tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa); + tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1); + tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa); + tk[i*4+4] = tk2[1] ^ tk3[3]; + tk[i*4+5] = tk2[2] ^ tk3[0]; + tk[i*4+6] = tk2[3] ^ tk3[1]; + tk[i*4+7] = tk2[0] ^ tk3[2]; + tk2[1] ^= (tk2[3] & 0xaaaaaaaa); + tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa); + tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1); + tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa); + tk[i*4+12] = tk2[2] ^ tk3[2]; + tk[i*4+13] = tk2[3] ^ tk3[3]; + tk[i*4+14] = tk2[0] ^ tk3[0]; + tk[i*4+15] = tk2[1] ^ tk3[1]; + tk2[2] ^= (tk2[0] & 0xaaaaaaaa); + tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa); + tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1); + tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa); + tk[i*4+20] = tk2[3] ^ tk3[1]; + tk[i*4+21] = tk2[0] ^ tk3[2]; + tk[i*4+22] = tk2[1] ^ tk3[3]; + tk[i*4+23] = tk2[2] ^ tk3[0]; + tk2[3] ^= (tk2[1] & 0xaaaaaaaa); + tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa); + tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1); + tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa); + tk[i*4+28] = tk2[0] ^ tk3[0]; + tk[i*4+29] = tk2[1] ^ tk3[1]; + tk[i*4+30] = tk2[2] ^ tk3[2]; + tk[i*4+31] = tk2[3] ^ tk3[3]; + } +} + +/****************************************************************************** * XOR TK with TK1 before applying the permutations. * The key is then rearranged to match the barrel shiftrows representation. ******************************************************************************/ @@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { u32 test; u32 tk1[4], tmp[4]; packing(tk1, key); - memcpy(tmp, tk, 16); - tmp[0] ^= tk1[0]; - tmp[1] ^= tk1[1]; - tmp[2] ^= tk1[2]; - tmp[3] ^= tk1[3]; + tmp[0] = tk[0] ^ tk1[0]; + tmp[1] = tk[1] ^ tk1[1]; + tmp[2] = tk[2] ^ tk1[2]; + tmp[3] = tk[3] ^ tk1[3]; for(int i = 0 ; i < rounds; i += 8) { test = (i % 16 < 8) ? 
1 : 0; //to apply the right power of P tk[i*4] = tmp[2] & 0xf0f0f0f0; tk[i*4+1] = tmp[3] & 0xf0f0f0f0; tk[i*4+2] = tmp[0] & 0xf0f0f0f0; tk[i*4+3] = tmp[1] & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+4, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+4] ^ tk1[0]; + tmp[1] = tk[i*4+5] ^ tk1[1]; + tmp[2] = tk[i*4+6] ^ tk1[2]; + tmp[3] = tk[i*4+7] ^ tk1[3]; if (test) permute_tk_2(tmp); // applies P^2 else @@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c; tk[i*4+11] = ROR(tmp[1],28) & 0x03030303; tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+12, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+12] ^ tk1[0]; + tmp[1] = tk[i*4+13] ^ tk1[1]; + tmp[2] = tk[i*4+14] ^ tk1[2]; + tmp[3] = tk[i*4+15] ^ tk1[3]; if (test) permute_tk_4(tmp); // applies P^4 else @@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0; tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0; tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0; - memcpy(tmp, tk+i*4+20, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+20] ^ tk1[0]; + tmp[1] = tk[i*4+21] ^ tk1[1]; + tmp[2] = tk[i*4+22] ^ tk1[2]; + tmp[3] = tk[i*4+23] ^ tk1[3]; if (test) permute_tk_6(tmp); // applies P^6 else @@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c; tk[i*4+27] = ROR(tmp[1],12) & 0x03030303; tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c; - memcpy(tmp, tk+i*4+28, 16); - XOR_BLOCKS(tmp, tk1); + tmp[0] = tk[i*4+28] ^ tk1[0]; + tmp[1] = tk[i*4+29] ^ tk1[1]; + tmp[2] = tk[i*4+30] ^ tk1[2]; + tmp[3] = tk[i*4+31] ^ tk1[3]; if (test) permute_tk_8(tmp); // applies P^8 for(int j = 0; j < 4; j++) { @@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) { ******************************************************************************/ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS); - precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS); - precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS); + precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS); permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS); // rtk+8 is NULL for(int i = 0; i < SKINNY128_384_ROUNDS; i++) { // add rconsts for(int j = 0; j < 4; j++) @@ -376,4 +441,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) { void precompute_rtk1(u32* rtk1, const u8* tk1) { memset(rtk1, 0x00, 16*16); permute_tk(rtk1, tk1, 16); -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h index 5615cbd..81dcbef 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h @@ -1,22 +1,17 @@ #ifndef TK_SCHEDULE_H_ #define TK_SCHEDULE_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include + +typedef uint8_t u8; +typedef uint32_t u32; void packing(u32* out, const u8* in); void unpacking(u8* out, u32 *in); void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3); void precompute_rtk1(u32* rtk1, const u8* tk1); -#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) - -#define XOR_BLOCKS(x,y) ({ \ - (x)[0] ^= (y)[0]; \ - (x)[1] ^= (y)[1]; \ - (x)[2] ^= (y)[2]; \ - (x)[3] ^= (y)[3]; \ -}) +#define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y)))) #define SWAPMOVE(a, b, 
mask, n) ({ \ tmp = (b ^ (a >> n)) & mask; \ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c index 640910e..61cf8b0 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c @@ -8,12 +8,10 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include "skinny128.h" #include "skinnyaead.h" #include -#include /****************************************************************************** * x ^= y where x, y are 128-bit blocks (16 bytes array). @@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, u8 feedback; u8 tmp[2*BLOCKBYTES]; memset(tmp, 0x00, 2*BLOCKBYTES); - memset(auth, 0x00, BLOCKBYTES); SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); + memset(auth, 0x00, BLOCKBYTES); while (adlen >= 2*BLOCKBYTES) { LE_STR_64(tmp, lfsr); UPDATE_LFSR(lfsr); LE_STR_64(tmp + BLOCKBYTES, lfsr); - SET_DOMAIN(tmp + BLOCKBYTES, 0x02); precompute_rtk1(tk->rtk1, tmp, tmp+BLOCKBYTES); skinny128_384_encrypt(tmp, tmp+BLOCKBYTES, ad, ad+BLOCKBYTES, *tk); xor_block(auth, tmp); @@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, adlen -= 2*BLOCKBYTES; ad += 2*BLOCKBYTES; UPDATE_LFSR(lfsr); + memset(tmp, 0x00, 2*BLOCKBYTES); // to save 32 bytes of RAM + SET_DOMAIN(tmp, 0x02); + SET_DOMAIN(tmp + BLOCKBYTES, 0x02); } if (adlen > BLOCKBYTES) { // pad and process 2 blocs in // LE_STR_64(tmp, lfsr); @@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, LE_STR_64(tmp, lfsr); if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_encrypt(auth, c, ad, c, *tk); + skinny128_384_encrypt(tmp, c, ad, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_encrypt(auth, auth, ad, ad, *tk); + skinny128_384_encrypt(tmp, tmp, ad, ad, *tk); } + xor_block(auth, tmp); } else if (adlen > 0) { LE_STR_64(tmp, lfsr); SET_DOMAIN(tmp, 0x03); // domain for padding ad @@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk, tmp[BLOCKBYTES + adlen] ^= 0x80; // padding if (mlen == 0) { // if tag has *NOT* been calculated yet precompute_rtk1(tk->rtk1, tmp, tag); // compute the tag - skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk); + skinny128_384_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk); } else { // if tag has been calculated yet precompute_rtk1(tk->rtk1, tmp, tmp); // process last ad block - skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); + skinny128_384_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk); } + xor_block(auth, tmp); } } @@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen, feedback |= sum[i] ^ c[i]; // constant-time tag verification return feedback; // ----------------- Process the associated data ----------------- -} \ No newline at end of file +} diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c index 2e1e9c3..304b899 100644 --- 
a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c @@ -16,12 +16,9 @@ * @author Alexandre Adomnicai, Nanyang Technological University, * alexandre.adomnicai@ntu.edu.sg * -* @date May 2020 +* @date June 2020 ******************************************************************************/ -#include -#include #include "skinny128.h" -#include "tk_schedule.h" /**************************************************************************** * The MixColumns operation for rounds i such that (i % 4) == 0. @@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 0 +* The inverse MixColumns operation for rounds i such that (i % 4) == 0 ****************************************************************************/ void inv_mixcolumns_0(u32* state) { u32 tmp; @@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 1 +* The inverse MixColumns operation for rounds i such that (i % 4) == 1 ****************************************************************************/ void inv_mixcolumns_1(u32* state) { u32 tmp; @@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 2 +* The inverse MixColumns operation for rounds i such that (i % 4) == 2 ****************************************************************************/ void inv_mixcolumns_2(u32* state) { u32 tmp; @@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) { } /**************************************************************************** -* The inverse MixColumns oepration for rounds i such that (i % 4) == 3 +* The inverse MixColumns operation for rounds i such that (i % 4) == 3 ****************************************************************************/ void inv_mixcolumns_3(u32* state) { u32 tmp; @@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, const u8* ptext_bis, const tweakey tk) { u32 state[8]; packing(state, ptext, ptext_bis); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); - QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320); - QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352); - QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384); - QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416); + for(int i = 0; i < 14; i++) + QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ctext, ctext_bis, state); } @@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, const u8* ctext_bis, const tweakey tk) { u32 state[8]; packing(state, ctext, ctext_bis); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+416); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+384); - 
INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+352); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+320); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+288); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+256); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+224); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+192); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+160); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3+128); - INV_QUADRUPLE_ROUND(state, tk.rtk1+96, tk.rtk2_3+96); - INV_QUADRUPLE_ROUND(state, tk.rtk1+64, tk.rtk2_3+64); - INV_QUADRUPLE_ROUND(state, tk.rtk1+32, tk.rtk2_3+32); - INV_QUADRUPLE_ROUND(state, tk.rtk1, tk.rtk2_3); + for(int i = 13; i >= 0; i--) + INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32); unpacking(ptext, ptext_bis, state); } \ No newline at end of file diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h index 5500af8..b6ffcf4 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h @@ -3,9 +3,7 @@ #include "skinny128.h" -typedef unsigned char u8; -typedef unsigned int u32; -typedef unsigned long long u64; +typedef uint64_t u64; #define TAGBYTES 16 #define KEYBYTES 16 diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c index 3897777..528d0eb 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c @@ -9,13 +9,9 @@ * * @date May 2020 *******************************************************************************/ -#include #include #include "tk_schedule.h" -typedef unsigned char u8; -typedef unsigned int u32; - /**************************************************************************** * The round constants according to the fixsliced representation. ****************************************************************************/ diff --git a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h index c6d03ce..7b17342 100644 --- a/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h +++ b/skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h @@ -1,8 +1,10 @@ #ifndef TK_SCHEDULE_BS_H_ #define TK_SCHEDULE_BS_H_ -typedef unsigned char u8; -typedef unsigned int u32; +#include + +typedef uint8_t u8; +typedef uint32_t u32; typedef struct { u32 rtk1[8*16];