diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S index 930df1d..00ca88e 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S @@ -4,8 +4,12 @@ Sebastien Riou, May 27th 2020 Implementation optimized for ARM-Cortex-M0 (Size and Speed) */ +//define __DRYGASCON_ARM_SELECTOR_V6M__ or add drygascon128_arm_selector.h to includes -#if defined(__DRYGASCON_ARM_SELECTOR_H__) +#ifndef __DRYGASCON_ARM_SELECTOR_V6M__ +#include "drygascon128_arm_selector.h" +#endif +#if defined(__DRYGASCON_ARM_SELECTOR_V6M__) .cpu cortex-m0 .syntax unified .code 16 diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S index f0c5fa1..dcee981 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S @@ -22,7 +22,12 @@ the 'v7m_fpu_x' can be used to prevent this attack. Note that implementation 'v7m_fpu' is faster (but requires FPU). */ -#if defined(__DRYGASCON_ARM_SELECTOR_H__) +//define __DRYGASCON_ARM_SELECTOR_V7M__ or add drygascon128_arm_selector.h to includes + +#ifndef __DRYGASCON_ARM_SELECTOR_V7M__ +#include "drygascon128_arm_selector.h" +#endif +#if defined(__DRYGASCON_ARM_SELECTOR_V7M__) .cpu cortex-m3 .syntax unified .code 16 @@ -113,145 +118,104 @@ drygascon128_g_v7m_main_loop: //r0 to r9: c //r11: constant to add as round constant //r14: pointer on C + push {r14} // addition of round constant //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - ldr r12,[r14,#R32_1-C0] - eors r12,r12,r1 - str r12,[r14,#R32_1-C0] - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - ldr r12,[r14,#R32_0-C0] - eors r12,r12,r0 - str r12,[r14,#R32_0-C0] - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - ldr r12,[r14,#R32_3-C0] - eors r12,r12,r3 - str r12,[r14,#R32_3-C0] - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - ldr r12,[r14,#R32_2-C0] - eors r12,r12,r2 - str r12,[r14,#R32_2-C0] - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - ldr r12,[r14,#R32_0-C0] - eors r12,r12,r5 - str r12,[r14,#R32_0-C0] - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - ldr r12,[r14,#R32_3-C0] - eors r12,r12,r4 - str r12,[r14,#R32_3-C0] - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - ldr r12,[r14,#R32_2-C0] - eors r12,r12,r7 - str r12,[r14,#R32_2-C0] - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 - ldr r12,[r14,#R32_1-C0] - eors r12,r12,r6 - str r12,[r14,#R32_1-C0] + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + + + pop {r14} + + // accumulate + adds r12,r14,#R0 + LDMIA.W r12, {r10,r11} + eor r10,r10,r0 + eor r11,r11,r1 + eor r10,r10,r5 + eor r11,r11,r6 + STMIA.W r12, {r10,r11} + + adds r12,r14,#R1 + LDMIA.W r12, {r10,r11} + eor r10,r10,r2 + eor r11,r11,r3 + eor r10,r10,r7 + eor r11,r11,r4 + STMIA.W r12, {r10,r11} //state: //r0 to r9: c @@ -390,120 +354,83 @@ drygascon128_f_v7m_mix128_coreround: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#16] cmp r10,#130 @@ -561,6 +488,7 @@ drygascon128_g0_v7m: //r11 = ((0xf - 0) << 4) | 0; movs r11,#0xf0 + push {r14} //state: //r0 to r9: c //r11: constant to add as round constant @@ -569,120 +497,84 @@ drygascon128_g0_v7m: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed + pop {r14} //update C STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S index d016dc8..a4b0f10 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S @@ -20,8 +20,13 @@ Reference manual) shows data cache lines of 16 bytes. - In the unlikely case in which none of the condition can be met, the 'v7m_fpu_x' can be used to prevent this attack. */ -#if defined(__DRYGASCON_ARM_SELECTOR_H__) -.cpu cortex-m3 +//define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ or add drygascon128_arm_selector.h to includes + +#ifndef __DRYGASCON_ARM_SELECTOR_V7M_FPU__ +#include "drygascon128_arm_selector.h" +#endif +#if defined(__DRYGASCON_ARM_SELECTOR_V7M_FPU__) +.cpu cortex-m4 .syntax unified .code 16 .thumb_func @@ -79,10 +84,11 @@ drygascon128_g_v7m_fpu: // 0 state address //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + movs r10,#0 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //round=r10=rounds-1; subs r11,r1,#1 @@ -116,141 +122,100 @@ drygascon128_g_v7m_fpu_main_loop: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - - vmov r14,S11 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //r14 is R32_1 - eors r14,r14,r1 - vmov r12,S10 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - //r12 is R32_0 - eors r12,r12,r0 - //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //r12 is R32_0 - eors r12,r12,r5 - vmov S10,r12 - vmov r12,S13 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - //r12 is R32_3 - eors r12,r12,r4 + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //r12 is R32_3 - eors r12,r12,r3 - vmov S13,r12 - vmov r12,S12 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - //r12 is R32_2 - eors r12,r12,r2 + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 + //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //r12 is R32_2 - eors r12,r12,r7 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + + // accumulate + vmov r10,S10 + vmov r11,S11 + vmov r12,S12 + vmov r14,S13 + eor r10,r10,r0 + eor r11,r11,r1 + eor r12,r12,r2 + eor r14,r14,r3 + eor r10,r10,r5 + eor r11,r11,r6 + eor r12,r12,r7 + eor r14,r14,r4 + vmov S10,r10 + vmov S11,r11 vmov S12,r12 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 - //r14 is R32_1 - eors r14,r14,r6 - vmov S11,r14 + vmov S13,r14 //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#4] @@ -294,10 +259,10 @@ drygascon128_f_v7m_fpu: push {r0,r3,r10} //make the same stack frame as drygascon128_g_cm7 push {r1,r2} //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //Load C adds r11,r0,#C0 @@ -391,120 +356,83 @@ drygascon128_f_v7m_fpu_mix128_coreround: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#16] cmp r10,#130 @@ -561,6 +489,7 @@ drygascon128_g0_v7m_fpu: //r11 = ((0xf - 0) << 4) | 0; movs r11,#0xf0 + push {r14} //state: //r0 to r9: c //r11: constant to add as round constant @@ -569,120 +498,84 @@ drygascon128_g0_v7m_fpu: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed + pop {r14} //update C STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S index 53472ea..5071eee 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S @@ -7,7 +7,12 @@ Include protection against timing attack on X look up operations Note that implementation 'v7m_fpu' is faster and safe on all Cortex-M7 as of May 2020. */ -#if defined(__DRYGASCON_ARM_SELECTOR_H__) +//define __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ or add drygascon128_arm_selector.h to includes + +#ifndef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ +#include "drygascon128_arm_selector.h" +#endif +#if defined(__DRYGASCON_ARM_SELECTOR_V7M_FPU_X__) .cpu cortex-m7 .syntax unified .code 16 @@ -66,10 +71,11 @@ drygascon128_g_v7m_fpu_x: // 0 state address //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + movs r10,#0 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //round=r10=rounds-1; subs r11,r1,#1 @@ -103,141 +109,100 @@ drygascon128_g_v7m_fpu_x_main_loop: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - - vmov r14,S11 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //r14 is R32_1 - eors r14,r14,r1 - vmov r12,S10 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - //r12 is R32_0 - eors r12,r12,r0 - //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //r12 is R32_0 - eors r12,r12,r5 - vmov S10,r12 - vmov r12,S13 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - //r12 is R32_3 - eors r12,r12,r4 + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //r12 is R32_3 - eors r12,r12,r3 - vmov S13,r12 - vmov r12,S12 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - //r12 is R32_2 - eors r12,r12,r2 + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 + //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //r12 is R32_2 - eors r12,r12,r7 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + + // accumulate + vmov r10,S10 + vmov r11,S11 + vmov r12,S12 + vmov r14,S13 + eor r10,r10,r0 + eor r11,r11,r1 + eor r12,r12,r2 + eor r14,r14,r3 + eor r10,r10,r5 + eor r11,r11,r6 + eor r12,r12,r7 + eor r14,r14,r4 + vmov S10,r10 + vmov S11,r11 vmov S12,r12 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 - //r14 is R32_1 - eors r14,r14,r6 - vmov S11,r14 + vmov S13,r14 //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#4] @@ -281,10 +246,10 @@ drygascon128_f_v7m_fpu_x: push {r0,r3,r10} //make the same stack frame as drygascon128_g_cm7 push {r1,r2} //r=0 - VSUB.F32 S10, S10, S10 - VSUB.F32 S11, S11, S11 - VSUB.F32 S12, S12, S12 - VSUB.F32 S13, S13, S13 + vmov S10,r10 + vmov S11,r10 + vmov S12,r10 + vmov S13,r10 //Load C adds r11,r0,#C0 @@ -389,120 +354,83 @@ drygascon128_f_v7m_fpu_x_mix128_coreround: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed ldr r10,[sp,#16] cmp r10,#130 @@ -559,6 +487,7 @@ drygascon128_g0_v7m_fpu_x: //r11 = ((0xf - 0) << 4) | 0; movs r11,#0xf0 + push {r14} //state: //r0 to r9: c //r11: constant to add as round constant @@ -567,120 +496,84 @@ drygascon128_g0_v7m_fpu_x: //C2L ^= round constant; eors r4,r4,r11 - // substitution layer, lower half - eors r0,r0,r8 - eors r8,r8,r6 - eors r4,r4,r2 - mvns r10,r0 - mvns r11,r6 - mvns r12,r8 - ands r10,r10,r2 - ands r11,r11,r8 - eors r8,r8,r10 - ands r12,r12,r0 - mvns r10,r4 - ands r10,r10,r6 - eors r6,r6,r12 - mvns r12,r2 - ands r12,r12,r4 - eors r4,r4,r11 - eors r6,r6,r4 - mvns r4,r4 - eors r0,r0,r12 - eors r2,r2,r10 - eors r2,r2,r0 - eors r0,r0,r8 - - // substitution layer, upper half - eors r1,r1,r9 - eors r9,r9,r7 - eors r5,r5,r3 - mvns r10,r1 - mvns r11,r7 - mvns r12,r9 - ands r10,r10,r3 - ands r11,r11,r9 - eors r9,r9,r10 - ands r12,r12,r1 - mvns r10,r5 - ands r10,r10,r7 - eors r7,r7,r12 - mvns r12,r3 - ands r12,r12,r5 - eors r5,r5,r11 - eors r7,r7,r5 - mvns r5,r5 - eors r1,r1,r12 - eors r3,r3,r10 - eors r3,r3,r1 - eors r1,r1,r9 - + eor r0, r0, r8 + eor r1, r1, r9 + eor r8, r8, r6 + eor r9, r9, r7 + eor r4, r4, r2 + eor r5, r5, r3 + bic r10, r0, r8 + bic r11, r8, r6 + bic r12, r4, r2 + bic r14, r2, r0 + eor r4, r4, r11 + eor r0, r0, r12 + eor r8, r8, r14 + bic r14, r6, r4 + eor r6, r6, r10 + bic r12, r1, r9 + bic r10, r5, r3 + bic r11, r9, r7 + eor r2, r2, r14 + eor r1, r1, r10 + eor r5, r5, r11 + bic r14, r3, r1 + bic r10, r7, r5 + eor r7, r7, r12 + eor r7, r7, r5 + eor r9, r9, r14 + eor r3, r3, r10 + eor r6, r6, r4 + eor r2, r2, r0 + eor r3, r3, r1 + eor r0, r0, r8 + eor r1, r1, r9 + mvn r4, r4 + mvn r5, r5 // linear diffusion layer - //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); - //c4 high part - rors r11,r9,#(20) - eors r9,r11,r9 - rors r10,r8,#(4) - eors r9,r10,r9 - //c4 low part - rors r11,r11,#((32-20+3)%32) - eors r11,r11,r8 - rors r10,r8,#(20) - eors r8,r10,r11 - //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19); - //c0 high part - rors r11,r1,#(14) - eors r1,r11,r1 - rors r10,r0,#(10) - eors r1,r10,r1 - //c0 low part - rors r11,r11,#((32-14+9)%32) - eors r11,r11,r0 - rors r10,r0,#(14) - eors r0,r10,r11 - + //c0 step 1 + eor r11, r1, r0, ror #10 + eor r10, r0, r1, ror #9 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61); - //c1 high part - rors r11,r3,#(19) - eors r3,r11,r3 - rors r10,r2,#(31) - eors r3,r10,r3 - //c1 low part - rors r11,r11,#((32-19+30)%32) - eors r11,r11,r2 - rors r10,r2,#(19) - eors r2,r10,r11 - + //c1 step 1 + eor r14, r3, r2, ror #31 + eor r12, r2, r3, ror #30 + //c0 step 2 + eor r1, r11, r1, ror #14 + eor r0, r10, r0, ror #14 + //c1 step 2 + eor r3, r14, r3, ror #19 + eor r2, r12, r2, ror #19 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1); - //c2 high part - rors r11,r5,#(3) - eors r5,r11,r5 - rors r10,r4,#(1) - eors r5,r10,r5 - //c2 low part - rors r11,r11,#((32-3+0)%32) - eors r11,r11,r4 - rors r10,r4,#(3) - eors r4,r10,r11 - + //c2 step 1 + eor r11, r5, r4, ror #1 + eor r10, r4, r5, ror #0 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17); - //c3 high part - rors r11,r7,#(5) - eors r7,r11,r7 - rors r10,r6,#(9) - eors r7,r10,r7 - //c3 low part - rors r11,r11,#((32-5+8)%32) - eors r11,r11,r6 - rors r10,r6,#(5) - eors r6,r10,r11 + //c3 step 1 + eor r14, r7, r6, ror #9 + eor r12, r6, r7, ror #8 + //c2 step 2 + eor r5, r11, r5, ror #3 + eor r4, r10, r4, ror #3 + //c3 step 2 + eor r7, r14, r7, ror #5 + eor r6, r12, r6, ror #5 + //c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7); + //c4 step 1 + eor r11, r9, r8, ror #4 + eor r10, r8, r9, ror #3 + //c4 step 2 + eor r9, r11, r9, ror #20 + eor r8, r10, r8, ror #20 + //state: //r0 to r9: c - //r10,r11,r12 destroyed + //r10,r11,r12,r14 destroyed + pop {r14} //update C STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h index fb2275a..9c86969 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h @@ -3,41 +3,73 @@ //Optional file to select the best implementation for each chip #ifdef STM32H743xx - #define __DRYGASCON_ARM_SELECTOR_V7M__ - #define __DRYGASCON_ARM_SELECTOR_FPU__ + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef STM32F746xx + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ +#endif + +#ifdef STM32F411xx + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ +#endif + +#ifdef STM32L552xx //technically it is V8M but we don't have a specific code for that one #define __DRYGASCON_ARM_SELECTOR_V7M__ - #define __DRYGASCON_ARM_SELECTOR_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef STM32F103xx #define __DRYGASCON_ARM_SELECTOR_V7M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef STM32L011xx #define __DRYGASCON_ARM_SELECTOR_V6M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #ifdef __SAM3X8E__ #define __DRYGASCON_ARM_SELECTOR_V7M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif //TODO: add more chips here -#ifdef __DRYGASCON_ARM_SELECTOR_V7M__ - #ifdef __DRYGASCON_ARM_SELECTOR_FPU__ - #define DRYGASCON_G_OPT drygascon128_g_v7m_fpu - #define DRYGASCON_F_OPT drygascon128_f_v7m_fpu - #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu - #else - #define DRYGASCON_G_OPT drygascon128_g_v7m - #define DRYGASCON_F_OPT drygascon128_f_v7m - #define DRYGASCON_G0_OPT drygascon128_g0_v7m +#ifndef __DRYGASCON_ARM_SELECTOR_FOUND__ + //more generic defines catching whole families + #if defined(STM32F4xx) || defined(STM32F7xx) || defined(STM32H7xx) + #define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ + #endif + + #if defined(STM32F1xx) + #define __DRYGASCON_ARM_SELECTOR_V7M__ + #define __DRYGASCON_ARM_SELECTOR_FOUND__ #endif #endif +#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU__ + #define DRYGASCON_G_OPT drygascon128_g_v7m_fpu + #define DRYGASCON_F_OPT drygascon128_f_v7m_fpu + #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu +#endif + +#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ + #define DRYGASCON_G_OPT drygascon128_g_v7m_fpu_x + #define DRYGASCON_F_OPT drygascon128_f_v7m_fpu_x + #define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu_x +#endif + +#ifdef __DRYGASCON_ARM_SELECTOR_V7M__ + #define DRYGASCON_G_OPT drygascon128_g_v7m + #define DRYGASCON_F_OPT drygascon128_f_v7m + #define DRYGASCON_G0_OPT drygascon128_g0_v7m +#endif + #ifdef __DRYGASCON_ARM_SELECTOR_V6M__ #define DRYGASCON_G_OPT drygascon128_g_v6m #define DRYGASCON_F_OPT drygascon128_f_v6m diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c index ca1e9f9..275b50f 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c @@ -8,7 +8,7 @@ int crypto_aead_encrypt const unsigned char *npub, const unsigned char *k) { - return drygascon128_aead_encrypt + return drygascon128k16_aead_encrypt (c, clen, m, mlen, ad, adlen, nsec, npub, k); } @@ -20,6 +20,6 @@ int crypto_aead_decrypt const unsigned char *npub, const unsigned char *k) { - return drygascon128_aead_decrypt + return drygascon128k16_aead_decrypt (m, mlen, nsec, c, clen, ad, adlen, npub, k); } diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h index 0f907bd..977958c 100644 --- a/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h +++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h @@ -245,7 +245,7 @@ typedef union */ typedef struct { - gascon128_state_t c; /**< GASCON-128 state for the capacity */ + gascon128_state_t c; /**< GASCON-128 state for the capacity */ uint32_t domain; /**< Domain value to mix on next F call */ uint32_t rounds; /**< Number of rounds for next G call */ drysponge128_rate_t r; /**< Buffer for a rate block of data */