Commit cad26506 authored Jun 04, 2020 by Enrico Pozzobon

Merge branch 'email-submissions'

Parents: 121de979, a3a77713

Showing 36 changed files with 1365 additions and 1403 deletions.
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S  +5 −1
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S  +228 −336
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S  +236 −343
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S  +235 −342
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h  +44 −12
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c  +2 −2
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h  +1 −1
romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c  +3 −10
romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c  +85 −15
romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c  +3 −14
romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c  +86 −17
romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c  +3 −10
romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c  +85 −15
romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c  +3 −14
romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c  +86 −17
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c  +1 −3
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c  +6 −24
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h  +1 −3
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c  +87 −22
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h  +4 −10
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c  +13 −11
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c  +10 −30
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h  +1 −0
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h  +1 −3
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c  +1 −5
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h  +4 −2
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c  +3 −14
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c  +6 −34
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h  +1 −3
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c  +88 −24
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h  +5 −10
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c  +13 −11
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c  +10 −36
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h  +1 −3
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c  +0 −4
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h  +4 −2
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v6m.S (view file @ cad26506)

@@ -4,8 +4,12 @@ Sebastien Riou, May 27th 2020
 Implementation optimized for ARM-Cortex-M0 (Size and Speed)
 */
-#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+//define __DRYGASCON_ARM_SELECTOR_V6M__ or add drygascon128_arm_selector.h to includes
+#ifndef __DRYGASCON_ARM_SELECTOR_V6M__
+#include "drygascon128_arm_selector.h"
+#endif
+#if defined(__DRYGASCON_ARM_SELECTOR_V6M__)
 .cpu cortex-m0
 .syntax unified
 .code 16
...
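The guard rewrite above makes each assembly variant self-selecting: if the build does not define __DRYGASCON_ARM_SELECTOR_V6M__ itself, the file pulls in drygascon128_arm_selector.h and assembles to an empty object unless that header selects V6M. A minimal sketch of how a build might exercise the guard (the chip macro STM32L011xx comes from the selector header; the rest is illustrative, not code from the commit):

/* Illustrative only: with the new guards, all four .S variants can be fed to
 * the assembler unconditionally and only the selected one emits code. */
#define STM32L011xx                      /* one of the chips the selector header knows */
#include "drygascon128_arm_selector.h"   /* should define __DRYGASCON_ARM_SELECTOR_V6M__ */

#ifndef __DRYGASCON_ARM_SELECTOR_V6M__
#error "expected the selector to pick the v6m variant for this chip"
#endif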
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m.S (view file @ cad26506)

@@ -22,7 +22,12 @@ the 'v7m_fpu_x' can be used to prevent this attack.
 Note that implementation 'v7m_fpu' is faster (but requires FPU).
 */
-#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+//define __DRYGASCON_ARM_SELECTOR_V7M__ or add drygascon128_arm_selector.h to includes
+#ifndef __DRYGASCON_ARM_SELECTOR_V7M__
+#include "drygascon128_arm_selector.h"
+#endif
+#if defined(__DRYGASCON_ARM_SELECTOR_V7M__)
 .cpu cortex-m3
 .syntax unified
 .code 16
...
@@ -113,145 +118,104 @@ drygascon128_g_v7m_main_loop:
 //r0 to r9: c
 //r11: constant to add as round constant
 //r14: pointer on C
+	push {r14}
 // addition of round constant
 //C2L ^= round constant;
 	eors r4,r4,r11
-	eors r0,r0,r8
-	eors r8,r8,r6
-	eors r4,r4,r2
-	mvns r10,r0
-	mvns r11,r6
-	mvns r12,r8
-	ands r10,r10,r2
-	ands r11,r11,r8
-	eors r8,r8,r10
-	ands r12,r12,r0
-	mvns r10,r4
-	ands r10,r10,r6
-	eors r6,r6,r12
-	mvns r12,r2
-	ands r12,r12,r4
-	eors r4,r4,r11
-	eors r6,r6,r4
-	mvns r4,r4
-	eors r0,r0,r12
-	eors r2,r2,r10
-	eors r2,r2,r0
-	eors r0,r0,r8
-	eors r1,r1,r9
-	eors r9,r9,r7
-	eors r5,r5,r3
-	mvns r10,r1
-	mvns r11,r7
-	mvns r12,r9
-	ands r10,r10,r3
-	ands r11,r11,r9
-	eors r9,r9,r10
-	ands r12,r12,r1
-	mvns r10,r5
-	ands r10,r10,r7
-	eors r7,r7,r12
-	mvns r12,r3
-	ands r12,r12,r5
-	eors r5,r5,r11
-	eors r7,r7,r5
-	mvns r5,r5
-	eors r1,r1,r12
-	eors r3,r3,r10
-	eors r3,r3,r1
-	eors r1,r1,r9
+// substitution layer, lower half
+	eor r0, r0, r8
+	eor r1, r1, r9
+	eor r8, r8, r6
+	eor r9, r9, r7
+	eor r4, r4, r2
+	eor r5, r5, r3
+	bic r10, r0, r8
+	bic r11, r8, r6
+	bic r12, r4, r2
+	bic r14, r2, r0
+	eor r4, r4, r11
+	eor r0, r0, r12
+	eor r8, r8, r14
+	bic r14, r6, r4
+	eor r6, r6, r10
+	bic r12, r1, r9
+	bic r10, r5, r3
+	bic r11, r9, r7
+	eor r2, r2, r14
+	eor r1, r1, r10
+	eor r5, r5, r11
+	bic r14, r3, r1
+	bic r10, r7, r5
+	eor r7, r7, r12
+// substitution layer, upper half
+	eor r7, r7, r5
+	eor r9, r9, r14
+	eor r3, r3, r10
+	eor r6, r6, r4
+	eor r2, r2, r0
+	eor r3, r3, r1
+	eor r0, r0, r8
+	eor r1, r1, r9
+	mvn r4, r4
+	mvn r5, r5
 // linear diffusion layer
-//c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
-//c4 high part
-	rors r11,r9,#(20)
-	eors r9,r11,r9
-	rors r10,r8,#(4)
-	eors r9,r10,r9
-//c4 low part
-	rors r11,r11,#((32-20+3)%32)
-	eors r11,r11,r8
-	rors r10,r8,#(20)
-	eors r8,r10,r11
 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
-//c0 high part
-	rors r11,r1,#(14)
-	eors r1,r11,r1
-	rors r10,r0,#(10)
-	eors r1,r10,r1
-	ldr r12,[r14,#R32_1-C0]
-	eors r12,r12,r1
-	str r12,[r14,#R32_1-C0]
-//c0 low part
-	rors r11,r11,#((32-14+9)%32)
-	eors r11,r11,r0
-	rors r10,r0,#(14)
-	eors r0,r10,r11
-	ldr r12,[r14,#R32_0-C0]
-	eors r12,r12,r0
-	str r12,[r14,#R32_0-C0]
+//c0 step 1
+	eor r11, r1, r0, ror #10
+	eor r10, r0, r1, ror #9
 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
-//c1 high part
-	rors r11,r3,#(19)
-	eors r3,r11,r3
-	rors r10,r2,#(31)
-	eors r3,r10,r3
-	ldr r12,[r14,#R32_3-C0]
-	eors r12,r12,r3
-	str r12,[r14,#R32_3-C0]
-//c1 low part
-	rors r11,r11,#((32-19+30)%32)
-	eors r11,r11,r2
-	rors r10,r2,#(19)
-	eors r2,r10,r11
-	ldr r12,[r14,#R32_2-C0]
-	eors r12,r12,r2
-	str r12,[r14,#R32_2-C0]
+//c1 step 1
+	eor r14, r3, r2, ror #31
+	eor r12, r2, r3, ror #30
+//c0 step 2
+	eor r1, r11, r1, ror #14
+	eor r0, r10, r0, ror #14
+//c1 step 2
+	eor r3, r14, r3, ror #19
+	eor r2, r12, r2, ror #19
 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
-//c2 high part
-	rors r11,r5,#(3)
-	eors r5,r11,r5
-	rors r10,r4,#(1)
-	eors r5,r10,r5
-	ldr r12,[r14,#R32_0-C0]
-	eors r12,r12,r5
-	str r12,[r14,#R32_0-C0]
-//c2 low part
-	rors r11,r11,#((32-3+0)%32)
-	eors r11,r11,r4
-	rors r10,r4,#(3)
-	eors r4,r10,r11
-	ldr r12,[r14,#R32_3-C0]
-	eors r12,r12,r4
-	str r12,[r14,#R32_3-C0]
+//c2 step 1
+	eor r11, r5, r4, ror #1
+	eor r10, r4, r5, ror #0
 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
-//c3 high part
-	rors r11,r7,#(5)
-	eors r7,r11,r7
-	rors r10,r6,#(9)
-	eors r7,r10,r7
-	ldr r12,[r14,#R32_2-C0]
-	eors r12,r12,r7
-	str r12,[r14,#R32_2-C0]
-//c3 low part
-	rors r11,r11,#((32-5+8)%32)
-	eors r11,r11,r6
-	rors r10,r6,#(5)
-	eors r6,r10,r11
-	ldr r12,[r14,#R32_1-C0]
-	eors r12,r12,r6
-	str r12,[r14,#R32_1-C0]
+//c3 step 1
+	eor r14, r7, r6, ror #9
+	eor r12, r6, r7, ror #8
+//c2 step 2
+	eor r5, r11, r5, ror #3
+	eor r4, r10, r4, ror #3
+//c3 step 2
+	eor r7, r14, r7, ror #5
+	eor r6, r12, r6, ror #5
+//c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+//c4 step 1
+	eor r11, r9, r8, ror #4
+	eor r10, r8, r9, ror #3
+//c4 step 2
+	eor r9, r11, r9, ror #20
+	eor r8, r10, r8, ror #20
+	pop {r14}
+// accumulate
+	adds r12,r14,#R0
+	LDMIA.W r12, {r10,r11}
+	eor r10,r10,r0
+	eor r11,r11,r1
+	eor r10,r10,r5
+	eor r11,r11,r6
+	STMIA.W r12, {r10,r11}
+	adds r12,r14,#R1
+	LDMIA.W r12, {r10,r11}
+	eor r10,r10,r2
+	eor r11,r11,r3
+	eor r10,r10,r7
+	eor r11,r11,r4
+	STMIA.W r12, {r10,r11}
 //state:
 //r0 to r9: c
...
@@ -390,120 +354,83 @@ drygascon128_f_v7m_mix128_coreround:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	eors r0,r0,r8
-	eors r8,r8,r6
-	eors r4,r4,r2
-	mvns r10,r0
-	mvns r11,r6
-	mvns r12,r8
-	ands r10,r10,r2
-	ands r11,r11,r8
-	eors r8,r8,r10
-	ands r12,r12,r0
-	mvns r10,r4
-	ands r10,r10,r6
-	eors r6,r6,r12
-	mvns r12,r2
-	ands r12,r12,r4
-	eors r4,r4,r11
-	eors r6,r6,r4
-	mvns r4,r4
-	eors r0,r0,r12
-	eors r2,r2,r10
-	eors r2,r2,r0
-	eors r0,r0,r8
-	eors r1,r1,r9
-	eors r9,r9,r7
-	eors r5,r5,r3
-	mvns r10,r1
-	mvns r11,r7
-	mvns r12,r9
-	ands r10,r10,r3
-	ands r11,r11,r9
-	eors r9,r9,r10
-	ands r12,r12,r1
-	mvns r10,r5
-	ands r10,r10,r7
-	eors r7,r7,r12
-	mvns r12,r3
-	ands r12,r12,r5
-	eors r5,r5,r11
-	eors r7,r7,r5
-	mvns r5,r5
-	eors r1,r1,r12
-	eors r3,r3,r10
-	eors r3,r3,r1
-	eors r1,r1,r9
+// substitution layer, lower half
+	eor r0, r0, r8
+	eor r1, r1, r9
+	eor r8, r8, r6
+	eor r9, r9, r7
+	eor r4, r4, r2
+	eor r5, r5, r3
+	bic r10, r0, r8
+	bic r11, r8, r6
+	bic r12, r4, r2
+	bic r14, r2, r0
+	eor r4, r4, r11
+	eor r0, r0, r12
+	eor r8, r8, r14
+	bic r14, r6, r4
+	eor r6, r6, r10
+	bic r12, r1, r9
+	bic r10, r5, r3
+	bic r11, r9, r7
+	eor r2, r2, r14
+	eor r1, r1, r10
+	eor r5, r5, r11
+	bic r14, r3, r1
+	bic r10, r7, r5
+	eor r7, r7, r12
+// substitution layer, upper half
+	eor r7, r7, r5
+	eor r9, r9, r14
+	eor r3, r3, r10
+	eor r6, r6, r4
+	eor r2, r2, r0
+	eor r3, r3, r1
+	eor r0, r0, r8
+	eor r1, r1, r9
+	mvn r4, r4
+	mvn r5, r5
 // linear diffusion layer
-//c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
-//c4 high part
-	rors r11,r9,#(20)
-	eors r9,r11,r9
-	rors r10,r8,#(4)
-	eors r9,r10,r9
-//c4 low part
-	rors r11,r11,#((32-20+3)%32)
-	eors r11,r11,r8
-	rors r10,r8,#(20)
-	eors r8,r10,r11
 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
-//c0 high part
-	rors r11,r1,#(14)
-	eors r1,r11,r1
-	rors r10,r0,#(10)
-	eors r1,r10,r1
-//c0 low part
-	rors r11,r11,#((32-14+9)%32)
-	eors r11,r11,r0
-	rors r10,r0,#(14)
-	eors r0,r10,r11
+//c0 step 1
+	eor r11, r1, r0, ror #10
+	eor r10, r0, r1, ror #9
 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
-//c1 high part
-	rors r11,r3,#(19)
-	eors r3,r11,r3
-	rors r10,r2,#(31)
-	eors r3,r10,r3
-//c1 low part
-	rors r11,r11,#((32-19+30)%32)
-	eors r11,r11,r2
-	rors r10,r2,#(19)
-	eors r2,r10,r11
+//c1 step 1
+	eor r14, r3, r2, ror #31
+	eor r12, r2, r3, ror #30
+//c0 step 2
+	eor r1, r11, r1, ror #14
+	eor r0, r10, r0, ror #14
+//c1 step 2
+	eor r3, r14, r3, ror #19
+	eor r2, r12, r2, ror #19
 //c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
-//c2 high part
-	rors r11,r5,#(3)
-	eors r5,r11,r5
-	rors r10,r4,#(1)
-	eors r5,r10,r5
-//c2 low part
-	rors r11,r11,#((32-3+0)%32)
-	eors r11,r11,r4
-	rors r10,r4,#(3)
-	eors r4,r10,r11
+//c2 step 1
+	eor r11, r5, r4, ror #1
+	eor r10, r4, r5, ror #0
 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
-//c3 high part
-	rors r11,r7,#(5)
-	eors r7,r11,r7
-	rors r10,r6,#(9)
-	eors r7,r10,r7
-//c3 low part
-	rors r11,r11,#((32-5+8)%32)
-	eors r11,r11,r6
-	rors r10,r6,#(5)
-	eors r6,r10,r11
+//c3 step 1
+	eor r14, r7, r6, ror #9
+	eor r12, r6, r7, ror #8
+//c2 step 2
+	eor r5, r11, r5, ror #3
+	eor r4, r10, r4, ror #3
+//c3 step 2
+	eor r7, r14, r7, ror #5
+	eor r6, r12, r6, ror #5
+//c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+//c4 step 1
+	eor r11, r9, r8, ror #4
+	eor r10, r8, r9, ror #3
+//c4 step 2
+	eor r9, r11, r9, ror #20
+	eor r8, r10, r8, ror #20
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
 	ldr r10,[sp,#16]
 	cmp r10,#130
...
@@ -561,6 +488,7 @@ drygascon128_g0_v7m:
 //r11 = ((0xf - 0) << 4) | 0;
 	movs r11,#0xf0
+	push {r14}
 //state:
 //r0 to r9: c
 //r11: constant to add as round constant
...
@@ -569,120 +497,84 @@ drygascon128_g0_v7m:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	(same substitution-layer and linear-diffusion rewrite as in the mix128 core round above)
+	(same substitution-layer and linear-diffusion rewrite as in the mix128 core round above)
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
+	pop {r14}
 //update C
 	STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
...
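Two things change in every core round above: the substitution layer now uses bic (AND-NOT) instead of mvns/ands pairs, pressing r14 into service as an extra scratch register (hence the new push/pop around the round), and the linear diffusion layer computes each 64-bit rotation with the barrel shifter, folding every rotate into the second operand of an eor. A plain C sketch of the diffusion trick (my model, not code from the repository; the rotation constants are copied from the new assembly for c0): GASCON stores each 64-bit lane bit-interleaved in two 32-bit words, so a 64-bit rotation by an even amount 2k rotates both words by k, while an odd amount crosses the words.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n) {
    return (x >> n) | (x << ((32 - n) & 31));
}

/* c0 ^= rotr64_interleaved(c0, 28) ^ rotr64_interleaved(c0, 19), computed the
 * way the new assembly does it: each line below is one eor-with-ror
 * instruction, so the whole update costs four instructions and no scratch
 * rotations. */
static void diffuse_c0(uint32_t *hi, uint32_t *lo) {
    uint32_t t_hi = *hi ^ ror32(*lo, 10);   /* step 1: the odd rotation (19) crosses words */
    uint32_t t_lo = *lo ^ ror32(*hi, 9);
    *hi = t_hi ^ ror32(*hi, 14);            /* step 2: the even rotation (28) stays in-word */
    *lo = t_lo ^ ror32(*lo, 14);
}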
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu.S (view file @ cad26506)

@@ -20,8 +20,13 @@ Reference manual) shows data cache lines of 16 bytes.
 - In the unlikely case in which none of the condition can be met,
 the 'v7m_fpu_x' can be used to prevent this attack.
 */
-#if defined(__DRYGASCON_ARM_SELECTOR_H__)
-.cpu cortex-m3
+//define __DRYGASCON_ARM_SELECTOR_V7M_FPU__ or add drygascon128_arm_selector.h to includes
+#ifndef __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#include "drygascon128_arm_selector.h"
+#endif
+#if defined(__DRYGASCON_ARM_SELECTOR_V7M_FPU__)
+.cpu cortex-m4
 .syntax unified
 .code 16
 .thumb_func
...
@@ -79,10 +84,11 @@ drygascon128_g_v7m_fpu:
 // 0 state address
 //r=0
-	VSUB.F32 S10, S10, S10
-	VSUB.F32 S11, S11, S11
-	VSUB.F32 S12, S12, S12
-	VSUB.F32 S13, S13, S13
+	movs r10,#0
+	vmov S10,r10
+	vmov S11,r10
+	vmov S12,r10
+	vmov S13,r10
 //round=r10=rounds-1;
 	subs r11,r1,#1
...
@@ -116,141 +122,100 @@ drygascon128_g_v7m_fpu_main_loop:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	eors r0,r0,r8
-	eors r8,r8,r6
-	eors r4,r4,r2
-	mvns r10,r0
-	mvns r11,r6
-	mvns r12,r8
-	ands r10,r10,r2
-	ands r11,r11,r8
-	eors r8,r8,r10
-	ands r12,r12,r0
-	mvns r10,r4
-	ands r10,r10,r6
-	eors r6,r6,r12
-	mvns r12,r2
-	ands r12,r12,r4
-	eors r4,r4,r11
-	eors r6,r6,r4
-	mvns r4,r4
-	eors r0,r0,r12
-	eors r2,r2,r10
-	eors r2,r2,r0
-	eors r0,r0,r8
-	eors r1,r1,r9
-	eors r9,r9,r7
-	eors r5,r5,r3
-	mvns r10,r1
-	mvns r11,r7
-	mvns r12,r9
-	ands r10,r10,r3
-	ands r11,r11,r9
-	eors r9,r9,r10
-	ands r12,r12,r1
-	mvns r10,r5
-	ands r10,r10,r7
-	eors r7,r7,r12
-	mvns r12,r3
-	ands r12,r12,r5
-	eors r5,r5,r11
-	eors r7,r7,r5
-	mvns r5,r5
-	eors r1,r1,r12
-	eors r3,r3,r10
-	eors r3,r3,r1
-	eors r1,r1,r9
+// substitution layer, lower half
+	eor r0, r0, r8
+	eor r1, r1, r9
+	eor r8, r8, r6
+	eor r9, r9, r7
+	eor r4, r4, r2
+	eor r5, r5, r3
+	bic r10, r0, r8
+	bic r11, r8, r6
+	bic r12, r4, r2
+	bic r14, r2, r0
+	eor r4, r4, r11
+	eor r0, r0, r12
+	eor r8, r8, r14
+	bic r14, r6, r4
+	eor r6, r6, r10
+	bic r12, r1, r9
+	bic r10, r5, r3
+	bic r11, r9, r7
+	eor r2, r2, r14
+	eor r1, r1, r10
+	eor r5, r5, r11
+	bic r14, r3, r1
+	bic r10, r7, r5
+	eor r7, r7, r12
+// substitution layer, upper half
+	eor r7, r7, r5
+	eor r9, r9, r14
+	eor r3, r3, r10
+	eor r6, r6, r4
+	eor r2, r2, r0
+	eor r3, r3, r1
+	eor r0, r0, r8
+	eor r1, r1, r9
+	mvn r4, r4
+	mvn r5, r5
 // linear diffusion layer
-//c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
-//c4 high part
-	rors r11,r9,#(20)
-	eors r9,r11,r9
-	rors r10,r8,#(4)
-	eors r9,r10,r9
-//c4 low part
-	rors r11,r11,#((32-20+3)%32)
-	eors r11,r11,r8
-	rors r10,r8,#(20)
-	eors r8,r10,r11
-	vmov r14,S11
 //c0 ^= gascon_rotr64_interleaved(c0, 28) ^ gascon_rotr64_interleaved(c0, 19);
-//c0 high part
-	rors r11,r1,#(14)
-	eors r1,r11,r1
-	rors r10,r0,#(10)
-	eors r1,r10,r1
-//r14 is R32_1
-	eors r14,r14,r1
-	vmov r12,S10
-//c0 low part
-	rors r11,r11,#((32-14+9)%32)
-	eors r11,r11,r0
-	rors r10,r0,#(14)
-	eors r0,r10,r11
-//r12 is R32_0
-	eors r12,r12,r0
-//c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
-//c2 high part
-	rors r11,r5,#(3)
-	eors r5,r11,r5
-	rors r10,r4,#(1)
-	eors r5,r10,r5
-//r12 is R32_0
-	eors r12,r12,r5
-	vmov S10,r12
-	vmov r12,S13
-//c2 low part
-	rors r11,r11,#((32-3+0)%32)
-	eors r11,r11,r4
-	rors r10,r4,#(3)
-	eors r4,r10,r11
-//r12 is R32_3
-	eors r12,r12,r4
+//c0 step 1
+	eor r11, r1, r0, ror #10
+	eor r10, r0, r1, ror #9
 //c1 ^= gascon_rotr64_interleaved(c1, 38) ^ gascon_rotr64_interleaved(c1, 61);
-//c1 high part
-	rors r11,r3,#(19)
-	eors r3,r11,r3
-	rors r10,r2,#(31)
-	eors r3,r10,r3
-//r12 is R32_3
-	eors r12,r12,r3
-	vmov S13,r12
-	vmov r12,S12
-//c1 low part
-	rors r11,r11,#((32-19+30)%32)
-	eors r11,r11,r2
-	rors r10,r2,#(19)
-	eors r2,r10,r11
-//r12 is R32_2
-	eors r12,r12,r2
+//c1 step 1
+	eor r14, r3, r2, ror #31
+	eor r12, r2, r3, ror #30
+//c0 step 2
+	eor r1, r11, r1, ror #14
+	eor r0, r10, r0, ror #14
+//c1 step 2
+	eor r3, r14, r3, ror #19
+	eor r2, r12, r2, ror #19
+//c2 ^= gascon_rotr64_interleaved(c2, 6) ^ gascon_rotr64_interleaved(c2, 1);
+//c2 step 1
+	eor r11, r5, r4, ror #1
+	eor r10, r4, r5, ror #0
 //c3 ^= gascon_rotr64_interleaved(c3, 10) ^ gascon_rotr64_interleaved(c3, 17);
-//c3 high part
-	rors r11,r7,#(5)
-	eors r7,r11,r7
-	rors r10,r6,#(9)
-	eors r7,r10,r7
-//r12 is R32_2
-	eors r12,r12,r7
-	vmov S12,r12
-//c3 low part
-	rors r11,r11,#((32-5+8)%32)
-	eors r11,r11,r6
-	rors r10,r6,#(5)
-	eors r6,r10,r11
-//r14 is R32_1
-	eors r14,r14,r6
-	vmov S11,r14
+//c3 step 1
+	eor r14, r7, r6, ror #9
+	eor r12, r6, r7, ror #8
+//c2 step 2
+	eor r5, r11, r5, ror #3
+	eor r4, r10, r4, ror #3
+//c3 step 2
+	eor r7, r14, r7, ror #5
+	eor r6, r12, r6, ror #5
+//c4 ^= gascon_rotr64_interleaved(c4, 40) ^ gascon_rotr64_interleaved(c4, 7);
+//c4 step 1
+	eor r11, r9, r8, ror #4
+	eor r10, r8, r9, ror #3
+//c4 step 2
+	eor r9, r11, r9, ror #20
+	eor r8, r10, r8, ror #20
+// accumulate
+	vmov r10,S10
+	vmov r11,S11
+	vmov r12,S12
+	vmov r14,S13
+	eor r10,r10,r0
+	eor r11,r11,r1
+	eor r12,r12,r2
+	eor r14,r14,r3
+	eor r10,r10,r5
+	eor r11,r11,r6
+	eor r12,r12,r7
+	eor r14,r14,r4
+	vmov S10,r10
+	vmov S11,r11
+	vmov S12,r12
+	vmov S13,r14
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
 	ldr r10,[sp,#4]
...
@@ -294,10 +259,10 @@ drygascon128_f_v7m_fpu:
 	push {r0,r3,r10} //make the same stack frame as drygascon128_g_cm7
 	push {r1,r2}
 //r=0
-	VSUB.F32 S10, S10, S10
-	VSUB.F32 S11, S11, S11
-	VSUB.F32 S12, S12, S12
-	VSUB.F32 S13, S13, S13
+	vmov S10,r10
+	vmov S11,r10
+	vmov S12,r10
+	vmov S13,r10
 //Load C
 	adds r11,r0,#C0
...
@@ -391,120 +356,83 @@ drygascon128_f_v7m_fpu_mix128_coreround:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	(same substitution-layer and linear-diffusion rewrite as in drygascon128_f_v7m_mix128_coreround above)
+	(same substitution-layer and linear-diffusion rewrite as in drygascon128_f_v7m_mix128_coreround above)
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
 	ldr r10,[sp,#16]
 	cmp r10,#130
...
@@ -561,6 +489,7 @@ drygascon128_g0_v7m_fpu:
 //r11 = ((0xf - 0) << 4) | 0;
 	movs r11,#0xf0
+	push {r14}
 //state:
 //r0 to r9: c
 //r11: constant to add as round constant
...
@@ -569,120 +498,84 @@ drygascon128_g0_v7m_fpu:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	(same substitution-layer and linear-diffusion rewrite as in the mix128 core round above)
+	(same substitution-layer and linear-diffusion rewrite as in the mix128 core round above)
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
+	pop {r14}
 //update C
 	STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
...
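Two FPU-specific details in this file are worth spelling out. First, the zeroing of S10-S13 no longer uses VSUB.F32 x,x,x: x − x is not a reliable zero in IEEE arithmetic (it yields NaN when x holds NaN or infinity), which is presumably why the new code moves an integer 0 in through a core register. Second, the four 32-bit rate words R32_0..R32_3 are parked in S10-S13 across rounds, and the round now ends with one batched "accumulate" instead of scattered vmov round-trips. A C model of what that block computes, per my reading of the register moves (a sketch, not code from the repository):

#include <stdint.h>

/* r32[] models S10-S13; c[] models r0-r9 (the ten GASCON state words).
 * Each rate word absorbs two freshly-diffused state words per round. */
static void accumulate(uint32_t r32[4], const uint32_t c[10]) {
    r32[0] ^= c[0] ^ c[5];   /* vmov r10,S10 ; two eors ; vmov S10,r10 */
    r32[1] ^= c[1] ^ c[6];
    r32[2] ^= c[2] ^ c[7];
    r32[3] ^= c[3] ^ c[4];
}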
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm-v7m_fpu_x.S (view file @ cad26506)

@@ -7,7 +7,12 @@ Include protection against timing attack on X look up operations
 Note that implementation 'v7m_fpu' is faster and safe on all Cortex-M7 as of May 2020.
 */
-#if defined(__DRYGASCON_ARM_SELECTOR_H__)
+//define __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__ or add drygascon128_arm_selector.h to includes
+#ifndef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__
+#include "drygascon128_arm_selector.h"
+#endif
+#if defined(__DRYGASCON_ARM_SELECTOR_V7M_FPU_X__)
 .cpu cortex-m7
 .syntax unified
 .code 16
...
@@ -66,10 +71,11 @@ drygascon128_g_v7m_fpu_x:
 // 0 state address
 //r=0
-	VSUB.F32 S10, S10, S10
-	VSUB.F32 S11, S11, S11
-	VSUB.F32 S12, S12, S12
-	VSUB.F32 S13, S13, S13
+	movs r10,#0
+	vmov S10,r10
+	vmov S11,r10
+	vmov S12,r10
+	vmov S13,r10
 //round=r10=rounds-1;
 	subs r11,r1,#1
...
@@ -103,141 +109,100 @@ drygascon128_g_v7m_fpu_x_main_loop:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	(same substitution-layer and linear-diffusion rewrite as in drygascon128_g_v7m_fpu_main_loop above,
-	 including the removal of the per-word vmov/eors accumulation into S10-S13)
+	(same substitution-layer and linear-diffusion rewrite as in drygascon128_g_v7m_fpu_main_loop above,
+	 ending in the same batched "accumulate" block)
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
 	ldr r10,[sp,#4]
...
@@ -281,10 +246,10 @@ drygascon128_f_v7m_fpu_x:
 	push {r0,r3,r10} //make the same stack frame as drygascon128_g_cm7
 	push {r1,r2}
 //r=0
-	VSUB.F32 S10, S10, S10
-	VSUB.F32 S11, S11, S11
-	VSUB.F32 S12, S12, S12
-	VSUB.F32 S13, S13, S13
+	vmov S10,r10
+	vmov S11,r10
+	vmov S12,r10
+	vmov S13,r10
 //Load C
 	adds r11,r0,#C0
...
@@ -389,120 +354,83 @@ drygascon128_f_v7m_fpu_x_mix128_coreround:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	(same substitution-layer and linear-diffusion rewrite as in the fpu mix128 core round above)
+	(same substitution-layer and linear-diffusion rewrite as in the fpu mix128 core round above)
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
 	ldr r10,[sp,#16]
 	cmp r10,#130
...
@@ -559,6 +487,7 @@ drygascon128_g0_v7m_fpu_x:
 //r11 = ((0xf - 0) << 4) | 0;
 	movs r11,#0xf0
+	push {r14}
 //state:
 //r0 to r9: c
 //r11: constant to add as round constant
...
@@ -567,120 +496,84 @@ drygascon128_g0_v7m_fpu_x:
 //C2L ^= round constant;
 	eors r4,r4,r11
-	(same substitution-layer and linear-diffusion rewrite as in the mix128 core round above)
+	(same substitution-layer and linear-diffusion rewrite as in the mix128 core round above)
 //state:
 //r0 to r9: c
-//r10,r11,r12 destroyed
+//r10,r11,r12,r14 destroyed
+	pop {r14}
 //update C
 	STMIA.W r14,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}
...
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/drygascon128_arm_selector.h (view file @ cad26506)

@@ -3,41 +3,73 @@
 //Optional file to select the best implementation for each chip
 #ifdef STM32H743xx
-#define __DRYGASCON_ARM_SELECTOR_V7M__
-#define __DRYGASCON_ARM_SELECTOR_FPU__
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef STM32F746xx
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+#ifdef STM32F411xx
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
 #ifdef STM32L552xx //technically it is V8M but we don't have a specific code for that one
 #define __DRYGASCON_ARM_SELECTOR_V7M__
-#define __DRYGASCON_ARM_SELECTOR_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef STM32F103xx
 #define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef STM32L011xx
 #define __DRYGASCON_ARM_SELECTOR_V6M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 #ifdef __SAM3X8E__
 #define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
 #endif
 //TODO: add more chips here
-#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
-#ifdef __DRYGASCON_ARM_SELECTOR_FPU__
-#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu
-#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu
-#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
-#else
-#define DRYGASCON_G_OPT drygascon128_g_v7m
-#define DRYGASCON_F_OPT drygascon128_f_v7m
-#define DRYGASCON_G0_OPT drygascon128_g0_v7m
-#endif
-#endif
+#ifndef __DRYGASCON_ARM_SELECTOR_FOUND__
+//more generic defines catching whole families
+#if defined(STM32F4xx) || defined(STM32F7xx) || defined(STM32H7xx)
+#define __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+#if defined(STM32F1xx)
+#define __DRYGASCON_ARM_SELECTOR_V7M__
+#define __DRYGASCON_ARM_SELECTOR_FOUND__
+#endif
+#endif
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU__
+#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu
+#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu
+#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu
+#endif
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M_FPU_X__
+#define DRYGASCON_G_OPT drygascon128_g_v7m_fpu_x
+#define DRYGASCON_F_OPT drygascon128_f_v7m_fpu_x
+#define DRYGASCON_G0_OPT drygascon128_g0_v7m_fpu_x
+#endif
+#ifdef __DRYGASCON_ARM_SELECTOR_V7M__
+#define DRYGASCON_G_OPT drygascon128_g_v7m
+#define DRYGASCON_F_OPT drygascon128_f_v7m
+#define DRYGASCON_G0_OPT drygascon128_g0_v7m
+#endif
 #ifdef __DRYGASCON_ARM_SELECTOR_V6M__
 #define DRYGASCON_G_OPT drygascon128_g_v6m
 #define DRYGASCON_F_OPT drygascon128_f_v6m
...
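After this change the header resolves per-chip macros into one selected variant and exposes it through the DRYGASCON_*_OPT aliases, with __DRYGASCON_ARM_SELECTOR_FOUND__ acting as a "selection made" flag that also gates the new family-wide fallbacks. A minimal consumer sketch (the macro names come from this header; the wrapper and the function signature are my assumptions, not code from the repository):

#include "drygascon128_arm_selector.h"

#ifdef DRYGASCON_G_OPT
/* signature assumed from the .S comments: r0 = state address, r1 = rounds */
extern void DRYGASCON_G_OPT(void *state, unsigned int rounds);
#define drygascon128_g(s, n) DRYGASCON_G_OPT((s), (n))
#else
#error "no optimized DryGASCON variant selected for this chip"
#endif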
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/encrypt.c (view file @ cad26506)

@@ -8,7 +8,7 @@ int crypto_aead_encrypt
 	const unsigned char *npub,
 	const unsigned char *k)
 {
-	return drygascon128_aead_encrypt(c, clen, m, mlen, ad, adlen, nsec, npub, k);
+	return drygascon128k16_aead_encrypt(c, clen, m, mlen, ad, adlen, nsec, npub, k);
 }
...
@@ -20,6 +20,6 @@ int crypto_aead_decrypt
 	const unsigned char *npub,
 	const unsigned char *k)
 {
-	return drygascon128_aead_decrypt(m, mlen, nsec, c, clen, ad, adlen, npub, k);
+	return drygascon128k16_aead_decrypt(m, mlen, nsec, c, clen, ad, adlen, npub, k);
 }
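Only the internal names change here (drygascon128_aead_* becomes drygascon128k16_aead_*, the 16-byte-key variant), so code written against the public NIST LWC API is unaffected. A caller-side sketch under that API (the prototype is taken from this file; the buffer sizes are illustrative assumptions):

#include <stddef.h>

extern int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
		const unsigned char *m, unsigned long long mlen,
		const unsigned char *ad, unsigned long long adlen,
		const unsigned char *nsec,
		const unsigned char *npub, const unsigned char *k);

int demo(void) {
	unsigned char key[16] = {0}, npub[16] = {0};  /* 16-byte key: the "k16" in the new name */
	unsigned char msg[32] = {0};
	unsigned char ct[32 + 16];                    /* ciphertext plus tag (tag size assumed) */
	unsigned long long clen;
	/* nsec is unused by this API and may be NULL */
	return crypto_aead_encrypt(ct, &clen, msg, sizeof msg, NULL, 0, NULL, npub, key);
}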
drygascon/Implementations/crypto_aead/drygascon128/add_arm_cortex-m/internal-drysponge.h (view file @ cad26506)

@@ -245,7 +245,7 @@ typedef union
 */
 typedef struct
 {
 	gascon128_state_t c;     /**< GASCON-128 state for the capacity */
 	uint32_t domain;         /**< Domain value to mix on next F call */
 	uint32_t rounds;         /**< Number of rounds for next G call */
 	drysponge128_rate_t r;   /**< Buffer for a rate block of data */
...
romulus/Implementations/crypto_aead/romulusm1+/opt32/skinny128.c (view file @ cad26506)

@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp;                 // used in SWAPMOVE macro
 	u32 state[4];            // 128-bit state
 	packing(state, ptext);   // from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state); // from bitsliced to byte representation
 }
\ No newline at end of file
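The rolled loop reproduces the deleted unrolled sequence exactly: rtk2_3 stores a fresh 16-word block for each of the 10 quadruple rounds, while rtk1 stores only 4 blocks, because the TK1 schedule repeats with period 16 rounds, so its offset cycles through 0, 16, 32, 48. A tiny standalone check of the index pattern (illustrative; QUADRUPLE_ROUND itself is a macro defined elsewhere in this file):

#include <stdio.h>

int main(void) {
	/* prints the same (rtk1, rtk2_3) offsets as the ten deleted calls */
	for (int i = 0; i < 10; i++)
		printf("QUADRUPLE_ROUND(state, rtk1+%d, rtk2_3+%d)\n", (i % 4) * 16, i * 16);
	return 0;
}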
romulus/Implementations/crypto_aead/romulusm1+/opt32/tk_schedule.c
View file @
cad26506
...
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
...
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
}
}
/******************************************************************************
/******************************************************************************
* Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
* It is equivalent to the following 2 function calls:
* - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
* - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
* However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform.
* On ARMv7 one should observe a gain of ~1k cycles per function call. It can be
* explained by the fact that less memory accesses to 'tk' are computed.
*
* To save some code size, the loop can be replaced by the following one:
* for(int i = 0 ; i < rounds; i+=2) {
* lfsr2_bs(tk2);
* lfsr3_bs(tk3);
* tk[i*4+4] = tk2[0] ^ tk3[0];
* tk[i*4+5] = tk2[1] ^ tk3[1];
* tk[i*4+6] = tk2[2] ^ tk3[2];
* tk[i*4+7] = tk2[3] ^ tk3[3];
* }
* at the cost of some cycles (~260 on ARM Cortex-M).
******************************************************************************/
void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
	u32 tmp, tk2[4], tk3[4];
	packing(tk2, t2);
	packing(tk3, t3);
	tk[0] = tk2[0] ^ tk3[0];
	tk[1] = tk2[1] ^ tk3[1];
	tk[2] = tk2[2] ^ tk3[2];
	tk[3] = tk2[3] ^ tk3[3];
	for(int i = 0; i < rounds; i += 8) {
		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
		tk[i*4+4] = tk2[1] ^ tk3[3];
		tk[i*4+5] = tk2[2] ^ tk3[0];
		tk[i*4+6] = tk2[3] ^ tk3[1];
		tk[i*4+7] = tk2[0] ^ tk3[2];
		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
		tk[i*4+12] = tk2[2] ^ tk3[2];
		tk[i*4+13] = tk2[3] ^ tk3[3];
		tk[i*4+14] = tk2[0] ^ tk3[0];
		tk[i*4+15] = tk2[1] ^ tk3[1];
		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
		tk[i*4+20] = tk2[3] ^ tk3[1];
		tk[i*4+21] = tk2[0] ^ tk3[2];
		tk[i*4+22] = tk2[1] ^ tk3[3];
		tk[i*4+23] = tk2[2] ^ tk3[0];
		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
		tk[i*4+28] = tk2[0] ^ tk3[0];
		tk[i*4+29] = tk2[1] ^ tk3[1];
		tk[i*4+30] = tk2[2] ^ tk3[2];
		tk[i*4+31] = tk2[3] ^ tk3[3];
	}
}
/******************************************************************************
 * XOR TK with TK1 before applying the permutations.
 * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
...
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	tmp[0] = tk[0] ^ tk1[0];
-	tmp[1] = tk[1] ^ tk1[1];
-	tmp[2] = tk[2] ^ tk1[2];
-	tmp[3] = tk[3] ^ tk1[3];
+	memcpy(tmp, tk, 16);
+	tmp[0] ^= tk1[0];
+	tmp[1] ^= tk1[1];
+	tmp[2] ^= tk1[2];
+	tmp[3] ^= tk1[3];
 	for(int i = 0; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0;	//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+4] ^ tk1[0];
-		tmp[1] = tk[i*4+5] ^ tk1[1];
-		tmp[2] = tk[i*4+6] ^ tk1[2];
-		tmp[3] = tk[i*4+7] ^ tk1[3];
+		memcpy(tmp, tk + i*4+4, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_2(tmp);	// applies P^2
 		else
...
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+12] ^ tk1[0];
-		tmp[1] = tk[i*4+13] ^ tk1[1];
-		tmp[2] = tk[i*4+14] ^ tk1[2];
-		tmp[3] = tk[i*4+15] ^ tk1[3];
+		memcpy(tmp, tk + i*4+12, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_4(tmp);	// applies P^4
 		else
...
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3],16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0],16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1],16) & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+20] ^ tk1[0];
-		tmp[1] = tk[i*4+21] ^ tk1[1];
-		tmp[2] = tk[i*4+22] ^ tk1[2];
-		tmp[3] = tk[i*4+23] ^ tk1[3];
+		memcpy(tmp, tk + i*4+20, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_6(tmp);	// applies P^6
 		else
...
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+28] ^ tk1[0];
-		tmp[1] = tk[i*4+29] ^ tk1[1];
-		tmp[2] = tk[i*4+30] ^ tk1[2];
-		tmp[3] = tk[i*4+31] ^ tk1[3];
+		memcpy(tmp, tk + i*4+28, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_8(tmp);	// applies P^8
 		for(int j = 0; j < 4; j++) {
...
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {	// add rconsts
 		for(int j = 0; j < 4; j++)
...
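The compact loop quoted in the comment above relies on two helpers, lfsr2_bs and lfsr3_bs, that this patch does not define. For readers who want the code-size variant, here is a minimal sketch of what they would have to do, derived purely from the unrolled update steps in precompute_lfsr_tk2_3; the helper names and the word-rotation convention are assumptions, not part of this commit:

#include <stdint.h>
typedef uint32_t u32;	/* matches the u32 used throughout tk_schedule.c */

/* One bitsliced LFSR2 step: update the oldest slice word, then rotate the
 * word order so that tk2[0..3] ^ tk3[0..3] is always the next round tweakey. */
static void lfsr2_bs(u32 tk2[4]) {
	u32 tmp = tk2[0] ^ (tk2[2] & 0xaaaaaaaa);
	tmp = ((tmp & 0xaaaaaaaa) >> 1) | ((tmp << 1) & 0xaaaaaaaa);
	tk2[0] = tk2[1];
	tk2[1] = tk2[2];
	tk2[2] = tk2[3];
	tk2[3] = tmp;
}

/* One bitsliced LFSR3 step; rotates the slice words in the opposite
 * direction, mirroring the tk3 index pattern of the unrolled loop. */
static void lfsr3_bs(u32 tk3[4]) {
	u32 tmp = tk3[3] ^ ((tk3[1] & 0xaaaaaaaa) >> 1);
	tmp = ((tmp & 0xaaaaaaaa) >> 1) | ((tmp << 1) & 0xaaaaaaaa);
	tk3[3] = tk3[2];
	tk3[2] = tk3[1];
	tk3[1] = tk3[0];
	tk3[0] = tmp;
}

With these two helpers, each pass of the i+=2 loop reproduces exactly one quarter of the unrolled body above; the ~260 extra cycles quoted in the comment presumably come from the per-call word-rotation stores that the unrolled version keeps implicit in its index choices.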
romulus/Implementations/crypto_aead/romulusm1/opt32/skinny128.c
View file @ cad26506
...
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
 	u32 tmp;	// used in SWAPMOVE macro
 	u32 state[4];	// 128-bit state
 	packing(state, ptext);	// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
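For context, the rolled-up skinny128_384 is driven by the two precompute routines from tk_schedule.c. A minimal calling sketch follows; it is not part of the commit, SKINNY128_384_ROUNDS = 56 follows the SKINNY-128-384 specification, and the 4-words-per-round buffer sizes follow the memset calls in precompute_rtk2_3 and precompute_rtk1:

#include "skinny128.h"
#include "tk_schedule.h"	/* assumed to declare the functions used below */

#ifndef SKINNY128_384_ROUNDS
#define SKINNY128_384_ROUNDS 56	/* assumption: 56 rounds for SKINNY-128-384 */
#endif

/* Encrypt one 16-byte block under the tweakey TK1 || TK2 || TK3. */
void skinny128_384_one_block(u8 ctext[16], const u8 ptext[16],
                             const u8 tk1[16], const u8 tk2[16], const u8 tk3[16]) {
	u32 rtk1[16*4];	/* TK1 schedule: 16 rounds only, reused via rtk1 + (i%4)*16 */
	u32 rtk2_3[SKINNY128_384_ROUNDS*4];	/* LFSR2(TK2) ^ LFSR3(TK3), all rounds */
	precompute_rtk1(rtk1, tk1);
	precompute_rtk2_3(rtk2_3, tk2, tk3);
	skinny128_384(ctext, ptext, rtk1, rtk2_3);
}

The (i % 4) * 16 indexing in the new loop works because the TK1 schedule is only precomputed for 16 rounds: the SKINNY tweakey permutation has order 16, so the same four quadruples of TK1 round keys repeat across all 14 QUADRUPLE_ROUND calls while rtk2_3 advances linearly.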
romulus/Implementations/crypto_aead/romulusm1/opt32/tk_schedule.c
View file @ cad26506
...
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 /******************************************************************************
+ * Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+ * It is equivalent to the following 2 function calls:
+ * - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+ * - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+ * However, 'precompute_lfsr_tk2_3' can save some cycles on some platforms.
+ * On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+ * explained by the fact that fewer memory accesses to 'tk' are performed.
+ *
+ * To save some code size, the loop can be replaced by the following one:
+ * for(int i = 0; i < rounds; i+=2) {
+ *     lfsr2_bs(tk2);
+ *     lfsr3_bs(tk3);
+ *     tk[i*4+4] = tk2[0] ^ tk3[0];
+ *     tk[i*4+5] = tk2[1] ^ tk3[1];
+ *     tk[i*4+6] = tk2[2] ^ tk3[2];
+ *     tk[i*4+7] = tk2[3] ^ tk3[3];
+ * }
+ * at the cost of some cycles (~260 on ARM Cortex-M).
+ ******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];
+	packing(tk2, t2);
+	packing(tk3, t3);
+	tk[0] = tk2[0] ^ tk3[0];
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0; i < rounds; i += 8) {
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+		tk[i*4+4] = tk2[1] ^ tk3[3];
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+/******************************************************************************
  * XOR TK with TK1 before applying the permutations.
  * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
...
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	tmp[0] = tk[0] ^ tk1[0];
-	tmp[1] = tk[1] ^ tk1[1];
-	tmp[2] = tk[2] ^ tk1[2];
-	tmp[3] = tk[3] ^ tk1[3];
+	memcpy(tmp, tk, 16);
+	tmp[0] ^= tk1[0];
+	tmp[1] ^= tk1[1];
+	tmp[2] ^= tk1[2];
+	tmp[3] ^= tk1[3];
 	for(int i = 0; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0;	//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+4] ^ tk1[0];
-		tmp[1] = tk[i*4+5] ^ tk1[1];
-		tmp[2] = tk[i*4+6] ^ tk1[2];
-		tmp[3] = tk[i*4+7] ^ tk1[3];
+		memcpy(tmp, tk + i*4+4, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_2(tmp);	// applies P^2
 		else
...
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+12] ^ tk1[0];
-		tmp[1] = tk[i*4+13] ^ tk1[1];
-		tmp[2] = tk[i*4+14] ^ tk1[2];
-		tmp[3] = tk[i*4+15] ^ tk1[3];
+		memcpy(tmp, tk + i*4+12, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_4(tmp);	// applies P^4
 		else
...
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3],16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0],16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1],16) & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+20] ^ tk1[0];
-		tmp[1] = tk[i*4+21] ^ tk1[1];
-		tmp[2] = tk[i*4+22] ^ tk1[2];
-		tmp[3] = tk[i*4+23] ^ tk1[3];
+		memcpy(tmp, tk + i*4+20, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_6(tmp);	// applies P^6
 		else
...
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+28] ^ tk1[0];
-		tmp[1] = tk[i*4+29] ^ tk1[1];
-		tmp[2] = tk[i*4+30] ^ tk1[2];
-		tmp[3] = tk[i*4+31] ^ tk1[3];
+		memcpy(tmp, tk + i*4+28, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_8(tmp);	// applies P^8
 		for(int j = 0; j < 4; j++) {
...
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {	// add rconsts
 		for(int j = 0; j < 4; j++)
...
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
 	memset(rtk1, 0x00, 16*16);
 	permute_tk(rtk1, tk1, 16);
 }
\ No newline at end of file
romulus/Implementations/crypto_aead/romulusn1+/opt32/skinny128.c
View file @ cad26506
...
@@ -93,15 +93,7 @@ void skinny128_384_plus(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp;	// used in SWAPMOVE macro
 	u32 state[4];	// 128-bit state
 	packing(state, ptext);	// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
romulus/Implementations/crypto_aead/romulusn1+/opt32/tk_schedule.c
View file @ cad26506
...
@@ -260,6 +260,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 /******************************************************************************
+ * Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+ * It is equivalent to the following 2 function calls:
+ * - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+ * - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+ * However, 'precompute_lfsr_tk2_3' can save some cycles on some platforms.
+ * On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+ * explained by the fact that fewer memory accesses to 'tk' are performed.
+ *
+ * To save some code size, the loop can be replaced by the following one:
+ * for(int i = 0; i < rounds; i+=2) {
+ *     lfsr2_bs(tk2);
+ *     lfsr3_bs(tk3);
+ *     tk[i*4+4] = tk2[0] ^ tk3[0];
+ *     tk[i*4+5] = tk2[1] ^ tk3[1];
+ *     tk[i*4+6] = tk2[2] ^ tk3[2];
+ *     tk[i*4+7] = tk2[3] ^ tk3[3];
+ * }
+ * at the cost of some cycles (~260 on ARM Cortex-M).
+ ******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];
+	packing(tk2, t2);
+	packing(tk3, t3);
+	tk[0] = tk2[0] ^ tk3[0];
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0; i < rounds; i += 8) {
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+		tk[i*4+4] = tk2[1] ^ tk3[3];
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+/******************************************************************************
  * XOR TK with TK1 before applying the permutations.
  * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
...
@@ -267,19 +331,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	tmp[0] = tk[0] ^ tk1[0];
-	tmp[1] = tk[1] ^ tk1[1];
-	tmp[2] = tk[2] ^ tk1[2];
-	tmp[3] = tk[3] ^ tk1[3];
+	memcpy(tmp, tk, 16);
+	tmp[0] ^= tk1[0];
+	tmp[1] ^= tk1[1];
+	tmp[2] ^= tk1[2];
+	tmp[3] ^= tk1[3];
 	for(int i = 0; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0;	//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+4] ^ tk1[0];
-		tmp[1] = tk[i*4+5] ^ tk1[1];
-		tmp[2] = tk[i*4+6] ^ tk1[2];
-		tmp[3] = tk[i*4+7] ^ tk1[3];
+		memcpy(tmp, tk + i*4+4, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_2(tmp);	// applies P^2
 		else
...
@@ -296,8 +361,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+12] ^ tk1[0];
-		tmp[1] = tk[i*4+13] ^ tk1[1];
-		tmp[2] = tk[i*4+14] ^ tk1[2];
-		tmp[3] = tk[i*4+15] ^ tk1[3];
+		memcpy(tmp, tk + i*4+12, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_4(tmp);	// applies P^4
 		else
...
@@ -310,8 +377,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3],16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0],16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1],16) & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+20] ^ tk1[0];
-		tmp[1] = tk[i*4+21] ^ tk1[1];
-		tmp[2] = tk[i*4+22] ^ tk1[2];
-		tmp[3] = tk[i*4+23] ^ tk1[3];
+		memcpy(tmp, tk + i*4+20, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_6(tmp);	// applies P^6
 		else
...
@@ -328,8 +397,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+28] ^ tk1[0];
-		tmp[1] = tk[i*4+29] ^ tk1[1];
-		tmp[2] = tk[i*4+30] ^ tk1[2];
-		tmp[3] = tk[i*4+31] ^ tk1[3];
+		memcpy(tmp, tk + i*4+28, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_8(tmp);	// applies P^8
 		for(int j = 0; j < 4; j++) {
...
@@ -350,8 +421,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {	// add rconsts
 		for(int j = 0; j < 4; j++)
...
romulus/Implementations/crypto_aead/romulusn1/opt32/skinny128.c
View file @ cad26506
...
@@ -92,19 +92,7 @@ void skinny128_384(u8* ctext, const u8* ptext, const u32* rtk1, const u32* rtk2
 	u32 tmp;	// used in SWAPMOVE macro
 	u32 state[4];	// 128-bit state
 	packing(state, ptext);	// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+160);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+176);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+192);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+208);
+	for(int i = 0; i < 14; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
romulus/Implementations/crypto_aead/romulusn1/opt32/tk_schedule.c
View file @ cad26506
...
@@ -271,6 +271,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 /******************************************************************************
+ * Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+ * It is equivalent to the following 2 function calls:
+ * - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+ * - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+ * However, 'precompute_lfsr_tk2_3' can save some cycles on some platforms.
+ * On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+ * explained by the fact that fewer memory accesses to 'tk' are performed.
+ *
+ * To save some code size, the loop can be replaced by the following one:
+ * for(int i = 0; i < rounds; i+=2) {
+ *     lfsr2_bs(tk2);
+ *     lfsr3_bs(tk3);
+ *     tk[i*4+4] = tk2[0] ^ tk3[0];
+ *     tk[i*4+5] = tk2[1] ^ tk3[1];
+ *     tk[i*4+6] = tk2[2] ^ tk3[2];
+ *     tk[i*4+7] = tk2[3] ^ tk3[3];
+ * }
+ * at the cost of some cycles (~260 on ARM Cortex-M).
+ ******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tmp, tk2[4], tk3[4];
+	packing(tk2, t2);
+	packing(tk3, t3);
+	tk[0] = tk2[0] ^ tk3[0];
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0; i < rounds; i += 8) {
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+		tk[i*4+4] = tk2[1] ^ tk3[3];
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+/******************************************************************************
  * XOR TK with TK1 before applying the permutations.
  * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
...
@@ -278,19 +342,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	tmp[0] = tk[0] ^ tk1[0];
-	tmp[1] = tk[1] ^ tk1[1];
-	tmp[2] = tk[2] ^ tk1[2];
-	tmp[3] = tk[3] ^ tk1[3];
+	memcpy(tmp, tk, 16);
+	tmp[0] ^= tk1[0];
+	tmp[1] ^= tk1[1];
+	tmp[2] ^= tk1[2];
+	tmp[3] ^= tk1[3];
 	for(int i = 0; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0;	//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+4] ^ tk1[0];
-		tmp[1] = tk[i*4+5] ^ tk1[1];
-		tmp[2] = tk[i*4+6] ^ tk1[2];
-		tmp[3] = tk[i*4+7] ^ tk1[3];
+		memcpy(tmp, tk + i*4+4, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_2(tmp);	// applies P^2
 		else
...
@@ -307,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+12] ^ tk1[0];
-		tmp[1] = tk[i*4+13] ^ tk1[1];
-		tmp[2] = tk[i*4+14] ^ tk1[2];
-		tmp[3] = tk[i*4+15] ^ tk1[3];
+		memcpy(tmp, tk + i*4+12, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_4(tmp);	// applies P^4
 		else
...
@@ -321,8 +388,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3],16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0],16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1],16) & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+20] ^ tk1[0];
-		tmp[1] = tk[i*4+21] ^ tk1[1];
-		tmp[2] = tk[i*4+22] ^ tk1[2];
-		tmp[3] = tk[i*4+23] ^ tk1[3];
+		memcpy(tmp, tk + i*4+20, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_6(tmp);	// applies P^6
 		else
...
@@ -339,8 +408,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+28] ^ tk1[0];
-		tmp[1] = tk[i*4+29] ^ tk1[1];
-		tmp[2] = tk[i*4+30] ^ tk1[2];
-		tmp[3] = tk[i*4+31] ^ tk1[3];
+		memcpy(tmp, tk + i*4+28, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_8(tmp);	// applies P^8
 		for(int j = 0; j < 4; j++) {
...
@@ -361,8 +432,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {	// add rconsts
 		for(int j = 0; j < 4; j++)
...
@@ -376,4 +446,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
 	memset(rtk1, 0x00, 16*16);
 	permute_tk(rtk1, tk1, 16);
 }
\ No newline at end of file
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/encrypt.c
View file @ cad26506
...
@@ -8,12 +8,10 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>
 /******************************************************************************
  * x ^= y where x, y are 128-bit blocks (16 bytes array).
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinny128.c
View file @ cad26506
...
@@ -16,12 +16,9 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"
 /******************************************************************************
  * The MixColumns computation for rounds i such that (i % 4) == 0
...
@@ -153,16 +150,8 @@ void skinny128_384_plus_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp;	// used in SWAPMOVE macro
 	u32 state[4];	// 128-bit state
 	packing(state, ptext);	// from byte to bitsliced representation
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-	QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
+	for(int i = 0; i < 10; i++)
+		QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
...
@@ -176,15 +165,7 @@ void skinny128_384_plus_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
 	u32 tmp;	// used in SWAPMOVE macro
 	u32 state[4];	// 128-bit state
 	packing(state, ptext);	// from byte to bitsliced representation
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+144);
-	INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+128);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+112);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+96);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+80);
-	INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3+64);
-	INV_QUADRUPLE_ROUND(state, rtk1+48, rtk2_3+48);
-	INV_QUADRUPLE_ROUND(state, rtk1+32, rtk2_3+32);
-	INV_QUADRUPLE_ROUND(state, rtk1+16, rtk2_3+16);
-	INV_QUADRUPLE_ROUND(state, rtk1, rtk2_3);
+	for(int i = 9; i >= 0; i--)
+		INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
 	unpacking(ctext, state);	// from bitsliced to byte representation
 }
\ No newline at end of file
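Since encryption now walks rtk2_3 forward while decryption walks it backward with the same (i % 4) TK1 indexing, a round-trip check is a convenient sanity test for the rolled-up loops. A sketch, assuming the four-argument signatures shown in the hunk headers above (the fourth parameter name rtk2_3 is inferred from the loop bodies):

#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"

/* Returns 0 iff decrypt(encrypt(pt)) == pt for the given round tweakeys. */
int skinny128_384_plus_roundtrip(const u8 pt[16],
                                 const u32* rtk1, const u32* rtk2_3) {
	u8 ct[16], back[16];
	skinny128_384_plus_encrypt(ct, pt, rtk1, rtk2_3);
	skinny128_384_plus_decrypt(back, ct, rtk1, rtk2_3);
	return memcmp(back, pt, 16);
}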
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/skinnyaead.h
View file @ cad26506
...
@@ -3,9 +3,7 @@
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef uint64_t u64;
-typedef unsigned int u32;
+typedef unsigned long long u64;
 #define TAGBYTES 16
 #define KEYBYTES 16
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.c
View file @ cad26506
...
@@ -4,16 +4,11 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h> //for memcmp
+#include <string.h>
+#include "tk_schedule.h"
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
 /******************************************************************************
  * The round constants according to the new representation.
 ******************************************************************************/
...
@@ -260,6 +255,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 /******************************************************************************
+ * Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+ * It is equivalent to the following 2 function calls:
+ * - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+ * - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+ * However, 'precompute_lfsr_tk2_3' can save some cycles on some platforms.
+ * On ARMv7 one should observe a gain of ~1k cycles per function call, which is
+ * explained by the fact that fewer memory accesses to 'tk' are performed.
+ *
+ * To save some code size, the loop can be replaced by the following one:
+ * for(int i = 0; i < rounds; i+=2) {
+ *     lfsr2_bs(tk2);
+ *     lfsr3_bs(tk3);
+ *     tk[i*4+4] = tk2[0] ^ tk3[0];
+ *     tk[i*4+5] = tk2[1] ^ tk3[1];
+ *     tk[i*4+6] = tk2[2] ^ tk3[2];
+ *     tk[i*4+7] = tk2[3] ^ tk3[3];
+ * }
+ * at the cost of some cycles (~260 on ARM Cortex-M).
+ ******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+	u32 tk2[4], tk3[4];
+	packing(tk2, t2);
+	packing(tk3, t3);
+	tk[0] = tk2[0] ^ tk3[0];
+	tk[1] = tk2[1] ^ tk3[1];
+	tk[2] = tk2[2] ^ tk3[2];
+	tk[3] = tk2[3] ^ tk3[3];
+	for(int i = 0; i < rounds; i += 8) {
+		tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+		tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+		tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+		tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+		tk[i*4+4] = tk2[1] ^ tk3[3];
+		tk[i*4+5] = tk2[2] ^ tk3[0];
+		tk[i*4+6] = tk2[3] ^ tk3[1];
+		tk[i*4+7] = tk2[0] ^ tk3[2];
+		tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+		tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+		tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+		tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+		tk[i*4+12] = tk2[2] ^ tk3[2];
+		tk[i*4+13] = tk2[3] ^ tk3[3];
+		tk[i*4+14] = tk2[0] ^ tk3[0];
+		tk[i*4+15] = tk2[1] ^ tk3[1];
+		tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+		tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+		tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+		tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+		tk[i*4+20] = tk2[3] ^ tk3[1];
+		tk[i*4+21] = tk2[0] ^ tk3[2];
+		tk[i*4+22] = tk2[1] ^ tk3[3];
+		tk[i*4+23] = tk2[2] ^ tk3[0];
+		tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+		tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+		tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+		tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+		tk[i*4+28] = tk2[0] ^ tk3[0];
+		tk[i*4+29] = tk2[1] ^ tk3[1];
+		tk[i*4+30] = tk2[2] ^ tk3[2];
+		tk[i*4+31] = tk2[3] ^ tk3[3];
+	}
+}
+/******************************************************************************
  * XOR TK with TK1 before applying the permutations.
  * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
...
@@ -267,19 +326,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 	u32 test;
 	u32 tk1[4], tmp[4];
 	packing(tk1, key);
-	tmp[0] = tk[0] ^ tk1[0];
-	tmp[1] = tk[1] ^ tk1[1];
-	tmp[2] = tk[2] ^ tk1[2];
-	tmp[3] = tk[3] ^ tk1[3];
+	memcpy(tmp, tk, 16);
+	tmp[0] ^= tk1[0];
+	tmp[1] ^= tk1[1];
+	tmp[2] ^= tk1[2];
+	tmp[3] ^= tk1[3];
 	for(int i = 0; i < rounds; i += 8) {
 		test = (i % 16 < 8) ? 1 : 0;	//to apply the right power of P
 		tk[i*4] = tmp[2] & 0xf0f0f0f0;
 		tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
 		tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
 		tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+4] ^ tk1[0];
-		tmp[1] = tk[i*4+5] ^ tk1[1];
-		tmp[2] = tk[i*4+6] ^ tk1[2];
-		tmp[3] = tk[i*4+7] ^ tk1[3];
+		memcpy(tmp, tk + i*4+4, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_2(tmp);	// applies P^2
 		else
...
@@ -296,8 +356,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+10] |= ROR(tmp[0],12) & 0x0c0c0c0c;
 		tk[i*4+11] = ROR(tmp[1],28) & 0x03030303;
 		tk[i*4+11] |= ROR(tmp[1],12) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+12] ^ tk1[0];
-		tmp[1] = tk[i*4+13] ^ tk1[1];
-		tmp[2] = tk[i*4+14] ^ tk1[2];
-		tmp[3] = tk[i*4+15] ^ tk1[3];
+		memcpy(tmp, tk + i*4+12, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_4(tmp);	// applies P^4
 		else
...
@@ -310,8 +372,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+17] = ROR(tmp[3],16) & 0xf0f0f0f0;
 		tk[i*4+18] = ROR(tmp[0],16) & 0xf0f0f0f0;
 		tk[i*4+19] = ROR(tmp[1],16) & 0xf0f0f0f0;
-		tmp[0] = tk[i*4+20] ^ tk1[0];
-		tmp[1] = tk[i*4+21] ^ tk1[1];
-		tmp[2] = tk[i*4+22] ^ tk1[2];
-		tmp[3] = tk[i*4+23] ^ tk1[3];
+		memcpy(tmp, tk + i*4+20, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_6(tmp);	// applies P^6
 		else
...
@@ -328,8 +392,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 		tk[i*4+26] |= ROR(tmp[0],28) & 0x0c0c0c0c;
 		tk[i*4+27] = ROR(tmp[1],12) & 0x03030303;
 		tk[i*4+27] |= ROR(tmp[1],28) & 0x0c0c0c0c;
-		tmp[0] = tk[i*4+28] ^ tk1[0];
-		tmp[1] = tk[i*4+29] ^ tk1[1];
-		tmp[2] = tk[i*4+30] ^ tk1[2];
-		tmp[3] = tk[i*4+31] ^ tk1[3];
+		memcpy(tmp, tk + i*4+28, 16);
+		XOR_BLOCKS(tmp, tk1);
 		if (test)
 			permute_tk_8(tmp);	// applies P^8
 		for(int j = 0; j < 4; j++) {
...
@@ -350,8 +416,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3) {
 	memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-	precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+	precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
 	permute_tk(rtk, (u8*)(rtk+8), SKINNY128_384_ROUNDS);	// rtk+8 is NULL
 	for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {	// add rconsts
 		for(int j = 0; j < 4; j++)
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_1/tk_schedule.h
View file @ cad26506
 #ifndef TK_SCHEDULE_H_
 #define TK_SCHEDULE_H_
-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint32_t u32;
 void packing(u32* out, const u8* in);
 void unpacking(u8* out, u32* in);
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
 void precompute_rtk1(u32* rtk1, const u8* tk1);
 #define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
+#define XOR_BLOCKS(x,y) ({ \
+	(x)[0] ^= (y)[0]; \
+	(x)[1] ^= (y)[1]; \
+	(x)[2] ^= (y)[2]; \
+	(x)[3] ^= (y)[3]; \
+})
 #define SWAPMOVE(a, b, mask, n) ({ \
 	tmp = (b ^ (a >> n)) & mask; \
...
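The new XOR_BLOCKS macro is written as a GNU C statement expression, like the SWAPMOVE macro below it, and simply folds one 4-word block into another; it is what lets permute_tk replace four explicit tmp[j] = tk[...] ^ tk1[j] lines with a memcpy plus one macro call. A standalone illustration (the demo values are arbitrary):

#include <stdint.h>
typedef uint32_t u32;

#define XOR_BLOCKS(x,y) ({ \
	(x)[0] ^= (y)[0]; \
	(x)[1] ^= (y)[1]; \
	(x)[2] ^= (y)[2]; \
	(x)[3] ^= (y)[3]; \
})

void xor_blocks_demo(void) {
	u32 tmp[4] = {1, 2, 3, 4};
	const u32 tk1[4] = {4, 3, 2, 1};
	XOR_BLOCKS(tmp, tk1);	/* tmp is now {5, 1, 1, 5} */
}

Keeping this as a macro rather than a function presumably keeps the four XORs inlined even without link-time optimization, which matters on the Cortex-M targets this code is tuned for.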
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/encrypt.c
View file @ cad26506
...
@@ -8,12 +8,10 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>
 /******************************************************************************
  * x ^= y where x, y are 128-bit blocks (16 bytes array).
...
@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
     u8 feedback;
     u8 tmp[2*BLOCKBYTES];
     memset(tmp, 0x00, 2*BLOCKBYTES);
-    memset(auth, 0x00, BLOCKBYTES);
     SET_DOMAIN(tmp, 0x02);
+    SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
     while (adlen >= 2*BLOCKBYTES) {
         LE_STR_64(tmp, lfsr);
         UPDATE_LFSR(lfsr);
         LE_STR_64(tmp + BLOCKBYTES, lfsr);
-        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
         precompute_rtk1(tk->rtk1, tmp, tmp + BLOCKBYTES);
         skinny128_384_plus_encrypt(tmp, tmp + BLOCKBYTES, ad, ad + BLOCKBYTES, *tk);
         xor_block(auth, tmp);
...
@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
         adlen -= 2*BLOCKBYTES;
         ad += 2*BLOCKBYTES;
         UPDATE_LFSR(lfsr);
+        memset(tmp, 0x00, 2*BLOCKBYTES);    // to save 32 bytes of RAM
+        SET_DOMAIN(tmp, 0x02);
+        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
     }
     if (adlen > BLOCKBYTES) {   // pad and process 2 blocs in //
         LE_STR_64(tmp, lfsr);
...
@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
         LE_STR_64(tmp, lfsr);
         if (mlen == 0) {            // if tag has *NOT* been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_plus_encrypt(auth, c, ad, c, *tk);
+            skinny128_384_plus_encrypt(tmp, c, ad, c, *tk);
         } else {                    // if tag has been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tmp);    // process last ad block
-            skinny128_384_plus_encrypt(auth, auth, ad, ad, *tk);
+            skinny128_384_plus_encrypt(tmp, tmp, ad, ad, *tk);
         }
+        xor_block(auth, tmp);
     } else if (adlen > 0) {
         LE_STR_64(tmp, lfsr);
         SET_DOMAIN(tmp, 0x03);      // domain for padding ad
...
@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
         tmp[BLOCKBYTES + adlen] ^= 0x80;    // padding
         if (mlen == 0) {            // if tag has *NOT* been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_plus_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
+            skinny128_384_plus_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk);
         } else {                    // if tag has been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tmp);    // process last ad block
-            skinny128_384_plus_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
+            skinny128_384_plus_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
         }
+        xor_block(auth, tmp);
     }
 }
...
@@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
         feedback |= sum[i] ^ c[i];  // constant-time tag verification
     return feedback;
     // ----------------- Process the associated data -----------------
 }
\ No newline at end of file
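The final hunk above is the classic constant-time tag check: all byte differences are OR-accumulated instead of returning at the first mismatch, so the comparison time does not leak the position of a wrong byte. Isolated as a stand-alone sketch (illustrative only):

/* Returns 0 iff the two len-byte tags are equal; runs in time
 * independent of where (or whether) they differ. */
static int verify_tag(const unsigned char *sum, const unsigned char *tag,
                      unsigned int len) {
    unsigned char feedback = 0;
    for (unsigned int i = 0; i < len; i++)
        feedback |= sum[i] ^ tag[i];
    return feedback;
}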
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.c
View file @ cad26506
...
@@ -16,12 +16,9 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"
 /****************************************************************************
  * The MixColumns operation for rounds i such that (i % 4) == 0.
...
@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 0
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 0
 ****************************************************************************/
 void inv_mixcolumns_0(u32* state) {
     u32 tmp;
...
@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 1
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 1
 ****************************************************************************/
 void inv_mixcolumns_1(u32* state) {
     u32 tmp;
...
@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 2
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 2
 ****************************************************************************/
 void inv_mixcolumns_2(u32* state) {
     u32 tmp;
...
@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 3
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 3
 ****************************************************************************/
 void inv_mixcolumns_3(u32* state) {
     u32 tmp;
...
@@ -166,16 +163,8 @@ void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
     const u8* ptext_bis, const tweakey tk) {
     u32 state[8];
     packing(state, ptext, ptext_bis);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 32);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 64);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 96);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 128);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 160);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 192);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 224);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 256);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 288);
+    for(int i = 0; i < 10; i++)
+        QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
     unpacking(ctext, ctext_bis, state);
 }
...
@@ -188,15 +177,7 @@ void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
     const u8* ctext_bis, const tweakey tk) {
     u32 state[8];
     packing(state, ctext, ctext_bis);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 288);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 256);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 224);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 192);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 160);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 128);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 96);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 64);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 32);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3);
+    for(int i = 9; i >= 0; i--)
+        INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
     unpacking(ptext, ptext_bis, state);
 }
\ No newline at end of file
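The rolled loops work because rtk1 holds only 16 rounds (four quadruple rounds) of TK1 material while rtk2_3 holds the full schedule, so the rtk1 offset must cycle with period 4 while the rtk2_3 offset grows linearly. A quick harness confirming that the loop indices reproduce the ten unrolled offsets (hypothetical check, not part of the commit):

#include <stdio.h>

int main(void) {
    /* Prints rtk1 offsets 0,32,64,96,0,... and rtk2_3 offsets 0,32,...,288,
     * matching the ten QUADRUPLE_ROUND calls deleted above. */
    for (int i = 0; i < 10; i++)
        printf("rtk1 + %3d, rtk2_3 + %3d\n", (i % 4) * 32, i * 32);
    return 0;
}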
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinny128.h
View file @ cad26506
 #ifndef SKINNY128_H_
 #define SKINNY128_H_
 #include "tk_schedule.h"
 void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
...
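A round-trip sanity check against the two-block API declared here might look as follows (a sketch only; it assumes the tweakey was prepared beforehand with precompute_rtk1 and precompute_rtk2_3):

#include <string.h>
#include "skinny128.h"

/* Encrypt two blocks in parallel, decrypt them, and confirm the
 * plaintexts come back unchanged. */
static int roundtrip(const u8 p0[16], const u8 p1[16], const tweakey tk) {
    u8 c0[16], c1[16], d0[16], d1[16];
    skinny128_384_plus_encrypt(c0, c1, p0, p1, tk);
    skinny128_384_plus_decrypt(d0, d1, c0, c1, tk);
    return memcmp(d0, p0, 16) == 0 && memcmp(d1, p1, 16) == 0;
}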
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/skinnyaead.h
View file @ cad26506
...
@@ -3,9 +3,7 @@
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t u64;
 #define TAGBYTES 16
 #define KEYBYTES 16
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.c
View file @ cad26506
...
@@ -7,15 +7,11 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 *******************************************************************************/
-#include <stdio.h>
 #include <string.h>
 #include "tk_schedule.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
 /****************************************************************************
  * The round constants according to the fixsliced representation.
 ****************************************************************************/
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128+v1/opt32_2/tk_schedule.h
View file @ cad26506
 #ifndef TK_SCHEDULE_BS_H_
 #define TK_SCHEDULE_BS_H_
-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint32_t u32;
 typedef struct {
     u32 rtk1[8*16];
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/encrypt.c
View file @ cad26506
 /******************************************************************************
- * Constant-time implementation of SKINNY-AEAD-M1 (v1.1).
- *
+ * Constant-time implementation of SKINNY-AEAD-M1(v1).
+ * Two blocks are treated in parallel with SKINNY-128-384 whenever possible.
 *
 * For more details, see the paper at: https://
 *
 * @author Alexandre Adomnicai, Nanyang Technological University,
 * alexandre.adomnicai@ntu.edu.sg
 *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>
 /******************************************************************************
  * x ^= y where x, y are 128-bit blocks (16 bytes array).
...
@@ -75,12 +71,6 @@ int crypto_aead_encrypt (unsigned char *c, unsigned long long *clen,
     }
     LE_STR_64(tmp, lfsr);   // lfsr for tag computation
     precompute_rtk1(rtk1, tmp);
-    for(int i = 0; i < 16; i++) {
-        printf("%08x %08x %08x %08x\n", rtk1[i*4], rtk1[i*4+1], rtk1[i*4+2], rtk1[i*4+3]);
-    }
-    for(int i = 0; i < 56; i++) {
-        printf("%08x %08x %08x %08x\n", rtk2_3[i*4], rtk2_3[i*4+1], rtk2_3[i*4+2], rtk2_3[i*4+3]);
-    }
     skinny128_384_encrypt(c, c, rtk1, rtk2_3);  // compute the tag
     // ----------------- Process the plaintext -----------------
...
@@ -200,4 +190,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
         feedback |= sum[i] ^ c[i];  // constant-time tag verification
     return feedback;
     // ----------------- Process the associated data -----------------
 }
\ No newline at end of file
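The deleted loops dumped the full round-tweakey schedules to stdout, which also forced the <stdio.h> include; removing them is the right call for a submission build. Had the traces still been wanted, one conventional alternative is to fence them behind a compile-time flag (sketch, not part of this commit; DEBUG_RTK is a hypothetical macro name):

#ifdef DEBUG_RTK
#include <stdio.h>
/* Dump a round-tweakey array, four 32-bit words per round. */
static void dump_rtk(const u32 *rtk, int nrounds) {
    for (int i = 0; i < nrounds; i++)
        printf("%08x %08x %08x %08x\n",
               rtk[i*4], rtk[i*4+1], rtk[i*4+2], rtk[i*4+3]);
}
#endif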
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinny128.c
View file @ cad26506
...
@@ -16,12 +16,9 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"
 /******************************************************************************
  * The MixColumns computation for rounds i such that (i % 4) == 0
...
@@ -153,20 +150,8 @@ void skinny128_384_encrypt(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;                // used in SWAPMOVE macro
     u32 state[4];           // 128-bit state
     packing(state, ptext);  // from byte to bitsliced representation
-    QUADRUPLE_ROUND(state, rtk1,      rtk2_3);
-    QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 16);
-    QUADRUPLE_ROUND(state, rtk1 + 32, rtk2_3 + 32);
-    QUADRUPLE_ROUND(state, rtk1 + 48, rtk2_3 + 48);
-    QUADRUPLE_ROUND(state, rtk1,      rtk2_3 + 64);
-    QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 80);
-    QUADRUPLE_ROUND(state, rtk1 + 32, rtk2_3 + 96);
-    QUADRUPLE_ROUND(state, rtk1 + 48, rtk2_3 + 112);
-    QUADRUPLE_ROUND(state, rtk1,      rtk2_3 + 128);
-    QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 144);
-    QUADRUPLE_ROUND(state, rtk1 + 32, rtk2_3 + 160);
-    QUADRUPLE_ROUND(state, rtk1 + 48, rtk2_3 + 176);
-    QUADRUPLE_ROUND(state, rtk1,      rtk2_3 + 192);
-    QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 208);
+    for(int i = 0; i < 14; i++)
+        QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state);    // from bitsliced to byte representation
 }
...
@@ -180,19 +165,7 @@ void skinny128_384_decrypt(u8* ctext, const u8* ptext, const u32* rtk1,
     u32 tmp;                // used in SWAPMOVE macro
     u32 state[4];           // 128-bit state
     packing(state, ptext);  // from byte to bitsliced representation
-    INV_QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 208);
-    INV_QUADRUPLE_ROUND(state, rtk1,      rtk2_3 + 192);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 48, rtk2_3 + 176);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 32, rtk2_3 + 160);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 144);
-    INV_QUADRUPLE_ROUND(state, rtk1,      rtk2_3 + 128);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 48, rtk2_3 + 112);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 32, rtk2_3 + 96);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 80);
-    INV_QUADRUPLE_ROUND(state, rtk1,      rtk2_3 + 64);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 48, rtk2_3 + 48);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 32, rtk2_3 + 32);
-    INV_QUADRUPLE_ROUND(state, rtk1 + 16, rtk2_3 + 16);
-    INV_QUADRUPLE_ROUND(state, rtk1,      rtk2_3);
+    for(int i = 13; i >= 0; i--)
+        INV_QUADRUPLE_ROUND(state, rtk1 + (i%4)*16, rtk2_3 + i*16);
     unpacking(ctext, state);    // from bitsliced to byte representation
 }
\ No newline at end of file
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/skinnyaead.h
View file @ cad26506
...
@@ -3,9 +3,7 @@
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t u64;
 #define TAGBYTES 16
 #define KEYBYTES 16
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.c
View file @ cad26506
...
@@ -4,16 +4,11 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h> //for memcmp
+#include <string.h>
+#include "tk_schedule.h"
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
 /******************************************************************************
  * The round constants according to the new representation.
 ******************************************************************************/
...
@@ -271,6 +266,70 @@ void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds) {
 }
 /******************************************************************************
+ * Precompute LFSR2(TK2) ^ LFSR3(TK3) for all round tweakeys.
+ * It is equivalent to the following 2 function calls:
+ * - precompute_lfsr_tk2(tk, t2, SKINNY128_384_ROUNDS);
+ * - precompute_lfsr_tk3(tk, t3, SKINNY128_384_ROUNDS);
+ * However 'precompute_lfsr_tk2_3' can allow to save cycles on some platform.
+ * On ARMv7 one should observe a gain of ~1k cycles per function call. It can be
+ * explained by the fact that less memory accesses to 'tk' are computed.
+ *
+ * To save some code size, the loop can be replaced by the following one:
+ *     for(int i = 0 ; i < rounds; i+=2) {
+ *         lfsr2_bs(tk2);
+ *         lfsr3_bs(tk3);
+ *         tk[i*4+4] = tk2[0] ^ tk3[0];
+ *         tk[i*4+5] = tk2[1] ^ tk3[1];
+ *         tk[i*4+6] = tk2[2] ^ tk3[2];
+ *         tk[i*4+7] = tk2[3] ^ tk3[3];
+ *     }
+ * at the cost of some cycles (~260 on ARM Cortex-M).
+ ******************************************************************************/
+void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3, const int rounds) {
+    u32 tk2[4], tk3[4];
+    packing(tk2, t2);
+    packing(tk3, t3);
+    tk[0] = tk2[0] ^ tk3[0];
+    tk[1] = tk2[1] ^ tk3[1];
+    tk[2] = tk2[2] ^ tk3[2];
+    tk[3] = tk2[3] ^ tk3[3];
+    for(int i = 0 ; i < rounds; i+=8) {
+        tk2[0] ^= (tk2[2] & 0xaaaaaaaa);
+        tk2[0] = ((tk2[0] & 0xaaaaaaaa) >> 1) | ((tk2[0] << 1) & 0xaaaaaaaa);
+        tk3[3] ^= ((tk3[1] & 0xaaaaaaaa) >> 1);
+        tk3[3] = ((tk3[3] & 0xaaaaaaaa) >> 1) | ((tk3[3] << 1) & 0xaaaaaaaa);
+        tk[i*4+4] = tk2[1] ^ tk3[3];
+        tk[i*4+5] = tk2[2] ^ tk3[0];
+        tk[i*4+6] = tk2[3] ^ tk3[1];
+        tk[i*4+7] = tk2[0] ^ tk3[2];
+        tk2[1] ^= (tk2[3] & 0xaaaaaaaa);
+        tk2[1] = ((tk2[1] & 0xaaaaaaaa) >> 1) | ((tk2[1] << 1) & 0xaaaaaaaa);
+        tk3[2] ^= ((tk3[0] & 0xaaaaaaaa) >> 1);
+        tk3[2] = ((tk3[2] & 0xaaaaaaaa) >> 1) | ((tk3[2] << 1) & 0xaaaaaaaa);
+        tk[i*4+12] = tk2[2] ^ tk3[2];
+        tk[i*4+13] = tk2[3] ^ tk3[3];
+        tk[i*4+14] = tk2[0] ^ tk3[0];
+        tk[i*4+15] = tk2[1] ^ tk3[1];
+        tk2[2] ^= (tk2[0] & 0xaaaaaaaa);
+        tk2[2] = ((tk2[2] & 0xaaaaaaaa) >> 1) | ((tk2[2] << 1) & 0xaaaaaaaa);
+        tk3[1] ^= ((tk3[3] & 0xaaaaaaaa) >> 1);
+        tk3[1] = ((tk3[1] & 0xaaaaaaaa) >> 1) | ((tk3[1] << 1) & 0xaaaaaaaa);
+        tk[i*4+20] = tk2[3] ^ tk3[1];
+        tk[i*4+21] = tk2[0] ^ tk3[2];
+        tk[i*4+22] = tk2[1] ^ tk3[3];
+        tk[i*4+23] = tk2[2] ^ tk3[0];
+        tk2[3] ^= (tk2[1] & 0xaaaaaaaa);
+        tk2[3] = ((tk2[3] & 0xaaaaaaaa) >> 1) | ((tk2[3] << 1) & 0xaaaaaaaa);
+        tk3[0] ^= ((tk3[2] & 0xaaaaaaaa) >> 1);
+        tk3[0] = ((tk3[0] & 0xaaaaaaaa) >> 1) | ((tk3[0] << 1) & 0xaaaaaaaa);
+        tk[i*4+28] = tk2[0] ^ tk3[0];
+        tk[i*4+29] = tk2[1] ^ tk3[1];
+        tk[i*4+30] = tk2[2] ^ tk3[2];
+        tk[i*4+31] = tk2[3] ^ tk3[3];
+    }
+}
+/******************************************************************************
  * XOR TK with TK1 before applying the permutations.
  * The key is then rearranged to match the barrel shiftrows representation.
 ******************************************************************************/
...
@@ -278,19 +337,20 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
     u32 test;
     u32 tk1[4], tmp[4];
     packing(tk1, key);
-    memcpy(tmp, tk, 16);
-    tmp[0] ^= tk1[0];
-    tmp[1] ^= tk1[1];
-    tmp[2] ^= tk1[2];
-    tmp[3] ^= tk1[3];
+    tmp[0] = tk[0] ^ tk1[0];
+    tmp[1] = tk[1] ^ tk1[1];
+    tmp[2] = tk[2] ^ tk1[2];
+    tmp[3] = tk[3] ^ tk1[3];
     for(int i = 0; i < rounds; i+=8) {
         test = (i % 16 < 8) ? 1 : 0;    //to apply the right power of P
         tk[i*4]   = tmp[2] & 0xf0f0f0f0;
         tk[i*4+1] = tmp[3] & 0xf0f0f0f0;
         tk[i*4+2] = tmp[0] & 0xf0f0f0f0;
         tk[i*4+3] = tmp[1] & 0xf0f0f0f0;
-        memcpy(tmp, tk + i*4 + 4, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+4] ^ tk1[0];
+        tmp[1] = tk[i*4+5] ^ tk1[1];
+        tmp[2] = tk[i*4+6] ^ tk1[2];
+        tmp[3] = tk[i*4+7] ^ tk1[3];
         if (test)
             permute_tk_2(tmp);  // applies P^2
         else
...
@@ -307,8 +367,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+10] |= ROR(tmp[0], 12) & 0x0c0c0c0c;
         tk[i*4+11]  = ROR(tmp[1], 28) & 0x03030303;
         tk[i*4+11] |= ROR(tmp[1], 12) & 0x0c0c0c0c;
-        memcpy(tmp, tk + i*4 + 12, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+12] ^ tk1[0];
+        tmp[1] = tk[i*4+13] ^ tk1[1];
+        tmp[2] = tk[i*4+14] ^ tk1[2];
+        tmp[3] = tk[i*4+15] ^ tk1[3];
         if (test)
             permute_tk_4(tmp);  // applies P^4
         else
...
@@ -321,8 +383,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+17] = ROR(tmp[3], 16) & 0xf0f0f0f0;
         tk[i*4+18] = ROR(tmp[0], 16) & 0xf0f0f0f0;
         tk[i*4+19] = ROR(tmp[1], 16) & 0xf0f0f0f0;
-        memcpy(tmp, tk + i*4 + 20, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+20] ^ tk1[0];
+        tmp[1] = tk[i*4+21] ^ tk1[1];
+        tmp[2] = tk[i*4+22] ^ tk1[2];
+        tmp[3] = tk[i*4+23] ^ tk1[3];
         if (test)
             permute_tk_6(tmp);  // applies P^6
         else
...
@@ -339,8 +403,10 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
         tk[i*4+26] |= ROR(tmp[0], 28) & 0x0c0c0c0c;
         tk[i*4+27]  = ROR(tmp[1], 12) & 0x03030303;
         tk[i*4+27] |= ROR(tmp[1], 28) & 0x0c0c0c0c;
-        memcpy(tmp, tk + i*4 + 28, 16);
-        XOR_BLOCKS(tmp, tk1);
+        tmp[0] = tk[i*4+28] ^ tk1[0];
+        tmp[1] = tk[i*4+29] ^ tk1[1];
+        tmp[2] = tk[i*4+30] ^ tk1[2];
+        tmp[3] = tk[i*4+31] ^ tk1[3];
         if (test)
             permute_tk_8(tmp);  // applies P^8
         for(int j = 0; j < 4; j++) {
...
@@ -361,8 +427,7 @@ void permute_tk(u32* tk, const u8* key, const int rounds) {
 ******************************************************************************/
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3) {
     memset(rtk, 0x00, 16*SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk2(rtk, tk2, SKINNY128_384_ROUNDS);
-    precompute_lfsr_tk3(rtk, tk3, SKINNY128_384_ROUNDS);
+    precompute_lfsr_tk2_3(rtk, tk2, tk3, SKINNY128_384_ROUNDS);
     permute_tk(rtk, (u8*)(rtk + 8), SKINNY128_384_ROUNDS);  // rtk+8 is NULL
     for(int i = 0; i < SKINNY128_384_ROUNDS; i++) {     // add rconsts
         for(int j = 0; j < 4; j++)
...
@@ -376,4 +441,4 @@ void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8 * tk3) {
 void precompute_rtk1(u32* rtk1, const u8* tk1) {
     memset(rtk1, 0x00, 16*16);
     permute_tk(rtk1, tk1, 16);
 }
\ No newline at end of file
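The masked shift/XOR pairs in the new precompute_lfsr_tk2_3 are the bitsliced counterparts of the byte-level LFSR2 and LFSR3 defined in the SKINNY specification, applied to eight rounds of packed tweakey per loop iteration. For reference, a byte-at-a-time sketch of the two LFSRs they correspond to:

#include <stdint.h>

/* LFSR2, applied to each byte of TK2: (x7..x0) -> (x6..x0, x7 ^ x5). */
static uint8_t lfsr2_byte(uint8_t x) {
    return (uint8_t)((x << 1) | (((x >> 7) ^ (x >> 5)) & 1));
}

/* LFSR3, applied to each byte of TK3: (x7..x0) -> (x0 ^ x6, x7..x1). */
static uint8_t lfsr3_byte(uint8_t x) {
    return (uint8_t)((x >> 1) | ((((x >> 0) ^ (x >> 6)) & 1) << 7));
}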
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_1/tk_schedule.h
View file @ cad26506
 #ifndef TK_SCHEDULE_H_
 #define TK_SCHEDULE_H_
-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint32_t u32;
 void packing(u32* out, const u8* in);
 void unpacking(u8* out, u32* in);
 void precompute_rtk2_3(u32* rtk, const u8* tk2, const u8* tk3);
 void precompute_rtk1(u32* rtk1, const u8* tk1);
 #define ROR(x,y) (((x) >> (y)) | ((x) << (32 - (y))))
-#define XOR_BLOCKS(x,y) ({ \
-    (x)[0] ^= (y)[0]; \
-    (x)[1] ^= (y)[1]; \
-    (x)[2] ^= (y)[2]; \
-    (x)[3] ^= (y)[3]; \
-})
 #define SWAPMOVE(a, b, mask, n) ({ \
     tmp = (b ^ (a >> n)) & mask; \
...
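Since precompute_rtk2_3 now relies on the fused LFSR routine, a regression check that the fusion matches the two separate passes could look like this (hypothetical harness; the lfsr precompute functions are internal to tk_schedule.c, so they are declared extern here, and 56 is the SKINNY-128-384 round count used by this variant):

#include <assert.h>
#include <string.h>
#include "tk_schedule.h"

/* Internal routines from tk_schedule.c (not in the public header). */
extern void precompute_lfsr_tk2(u32* tk, const u8* key, const int rounds);
extern void precompute_lfsr_tk3(u32* tk, const u8* key, const int rounds);
extern void precompute_lfsr_tk2_3(u32* tk, const u8* t2, const u8* t3,
                                  const int rounds);

static void check_lfsr_fusion(const u8 *t2, const u8 *t3) {
    u32 a[4*56] = {0};      /* 16 bytes of round tweakey per round */
    u32 b[4*56] = {0};
    precompute_lfsr_tk2(a, t2, 56);
    precompute_lfsr_tk3(a, t3, 56);
    precompute_lfsr_tk2_3(b, t2, t3, 56);
    assert(memcmp(a, b, sizeof(a)) == 0);   /* equality promised by the comment above */
}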
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/encrypt.c
View file @ cad26506
...
@@ -8,12 +8,10 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include "skinny128.h"
 #include "skinnyaead.h"
 #include <string.h>
-#include <stdio.h>
 /******************************************************************************
  * x ^= y where x, y are 128-bit blocks (16 bytes array).
...
@@ -33,13 +31,13 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
     u8 feedback;
     u8 tmp[2*BLOCKBYTES];
     memset(tmp, 0x00, 2*BLOCKBYTES);
-    memset(auth, 0x00, BLOCKBYTES);
     SET_DOMAIN(tmp, 0x02);
+    SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
+    memset(auth, 0x00, BLOCKBYTES);
     while (adlen >= 2*BLOCKBYTES) {
         LE_STR_64(tmp, lfsr);
         UPDATE_LFSR(lfsr);
         LE_STR_64(tmp + BLOCKBYTES, lfsr);
-        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
         precompute_rtk1(tk->rtk1, tmp, tmp + BLOCKBYTES);
         skinny128_384_encrypt(tmp, tmp + BLOCKBYTES, ad, ad + BLOCKBYTES, *tk);
         xor_block(auth, tmp);
...
@@ -47,6 +45,9 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
         adlen -= 2*BLOCKBYTES;
         ad += 2*BLOCKBYTES;
         UPDATE_LFSR(lfsr);
+        memset(tmp, 0x00, 2*BLOCKBYTES);    // to save 32 bytes of RAM
+        SET_DOMAIN(tmp, 0x02);
+        SET_DOMAIN(tmp + BLOCKBYTES, 0x02);
     }
     if (adlen > BLOCKBYTES) {   // pad and process 2 blocs in //
         LE_STR_64(tmp, lfsr);
...
@@ -65,11 +66,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
         LE_STR_64(tmp, lfsr);
         if (mlen == 0) {            // if tag has *NOT* been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_encrypt(auth, c, ad, c, *tk);
+            skinny128_384_encrypt(tmp, c, ad, c, *tk);
         } else {                    // if tag has been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tmp);    // process last ad block
-            skinny128_384_encrypt(auth, auth, ad, ad, *tk);
+            skinny128_384_encrypt(tmp, tmp, ad, ad, *tk);
         }
+        xor_block(auth, tmp);
     } else if (adlen > 0) {
         LE_STR_64(tmp, lfsr);
         SET_DOMAIN(tmp, 0x03);      // domain for padding ad
...
@@ -78,11 +80,12 @@ static void skinny_aead_m1_auth(u8* auth, u8* c, u8* tag, tweakey* tk,
         tmp[BLOCKBYTES + adlen] ^= 0x80;    // padding
         if (mlen == 0) {            // if tag has *NOT* been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tag);    // compute the tag
-            skinny128_384_encrypt(auth, c, tmp + BLOCKBYTES, c, *tk);
+            skinny128_384_encrypt(tmp, c, tmp + BLOCKBYTES, c, *tk);
         } else {                    // if tag has been calculated yet
             precompute_rtk1(tk->rtk1, tmp, tmp);    // process last ad block
-            skinny128_384_encrypt(auth, auth, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
+            skinny128_384_encrypt(tmp, tmp, tmp + BLOCKBYTES, tmp + BLOCKBYTES, *tk);
         }
+        xor_block(auth, tmp);
     }
 }
...
@@ -290,4 +293,4 @@ int crypto_aead_decrypt (unsigned char *m, unsigned long long *mlen,
         feedback |= sum[i] ^ c[i];  // constant-time tag verification
     return feedback;
     // ----------------- Process the associated data -----------------
 }
\ No newline at end of file
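Throughout the AD and message loops above, lfsr is the 64-bit block counter of the SKINNY-AEAD mode: it is stored little-endian into the tweak block by LE_STR_64 and stepped by UPDATE_LFSR. A plain-C sketch of one plausible UPDATE_LFSR step, assuming the x^64 + x^4 + x^3 + x + 1 feedback polynomial from the SKINNY-AEAD specification (the actual macro lives in skinnyaead.h and may differ in form):

#include <stdint.h>

/* One LFSR step: shift left and fold the outgoing bit back in as 0x1B,
 * i.e. multiplication by x in GF(2^64) modulo x^64 + x^4 + x^3 + x + 1
 * (an assumption; check the macro definition before relying on this). */
static uint64_t update_lfsr(uint64_t lfsr) {
    uint64_t feedback = (lfsr >> 63) ? 0x1B : 0x00;
    return (lfsr << 1) ^ feedback;
}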
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinny128.c
View file @ cad26506
...
@@ -16,12 +16,9 @@
  * @author Alexandre Adomnicai, Nanyang Technological University,
  * alexandre.adomnicai@ntu.edu.sg
  *
- * @date May 2020
+ * @date June 2020
 ******************************************************************************/
-#include <stdio.h>
-#include <string.h>
 #include "skinny128.h"
-#include "tk_schedule.h"
 /****************************************************************************
  * The MixColumns operation for rounds i such that (i % 4) == 0.
...
@@ -84,7 +81,7 @@ void mixcolumns_3(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 0
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 0
 ****************************************************************************/
 void inv_mixcolumns_0(u32* state) {
     u32 tmp;
...
@@ -99,7 +96,7 @@ void inv_mixcolumns_0(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 1
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 1
 ****************************************************************************/
 void inv_mixcolumns_1(u32* state) {
     u32 tmp;
...
@@ -114,7 +111,7 @@ void inv_mixcolumns_1(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 2
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 2
 ****************************************************************************/
 void inv_mixcolumns_2(u32* state) {
     u32 tmp;
...
@@ -129,7 +126,7 @@ void inv_mixcolumns_2(u32* state) {
 }
 /****************************************************************************
- * The inverse MixColumns oepration for rounds i such that (i % 4) == 3
+ * The inverse MixColumns operation for rounds i such that (i % 4) == 3
 ****************************************************************************/
 void inv_mixcolumns_3(u32* state) {
     u32 tmp;
...
@@ -166,20 +163,8 @@ void skinny128_384_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext,
     const u8* ptext_bis, const tweakey tk) {
     u32 state[8];
     packing(state, ptext, ptext_bis);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 32);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 64);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 96);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 128);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 160);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 192);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 224);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 256);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 288);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 320);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 352);
-    QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 384);
-    QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 416);
+    for(int i = 0; i < 14; i++)
+        QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
     unpacking(ctext, ctext_bis, state);
 }
...
@@ -192,19 +177,7 @@ void skinny128_384_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext,
     const u8* ctext_bis, const tweakey tk) {
     u32 state[8];
     packing(state, ctext, ctext_bis);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 416);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 384);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 352);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 320);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 288);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 256);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 224);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 192);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 160);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3 + 128);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 96, tk.rtk2_3 + 96);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 64, tk.rtk2_3 + 64);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1 + 32, tk.rtk2_3 + 32);
-    INV_QUADRUPLE_ROUND(state, tk.rtk1,      tk.rtk2_3);
+    for(int i = 13; i >= 0; i--)
+        INV_QUADRUPLE_ROUND(state, tk.rtk1 + (i%4)*32, tk.rtk2_3 + i*32);
     unpacking(ptext, ptext_bis, state);
 }
\ No newline at end of file
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/skinnyaead.h
View file @ cad26506
...
@@ -3,9 +3,7 @@
 #include "skinny128.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
-typedef unsigned long long u64;
+typedef uint64_t u64;
 #define TAGBYTES 16
 #define KEYBYTES 16
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.c
View file @ cad26506
...
@@ -9,13 +9,9 @@
  *
  * @date May 2020
 *******************************************************************************/
-#include <stdio.h>
 #include <string.h>
 #include "tk_schedule.h"
-typedef unsigned char u8;
-typedef unsigned int u32;
 /****************************************************************************
  * The round constants according to the fixsliced representation.
 ****************************************************************************/
...
skinny/Implementations/crypto_aead/skinnyaeadtk3128128v1/opt32_2/tk_schedule.h
View file @ cad26506
 #ifndef TK_SCHEDULE_BS_H_
 #define TK_SCHEDULE_BS_H_
-typedef unsigned char u8;
-typedef unsigned int u32;
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint32_t u32;
 typedef struct {
     u32 rtk1[8*16];
...