diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/api.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/api.h
new file mode 100644
index 0000000..a4aa567
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/api.h
@@ -0,0 +1,5 @@
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/bytes_utils.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/bytes_utils.h
new file mode 100644
index 0000000..3e8bfee
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/bytes_utils.h
@@ -0,0 +1,129 @@
+
+#ifndef __BYTES_UTILS_H__
+#define __BYTES_UTILS_H__
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+//#ifndef bytes_utiles_printf
+//#define bytes_utiles_printf printf
+//#endif
+#ifndef bytes_utiles_printf
+#define bytes_utiles_printf printf
+#endif
+
+//replace 0 by .
+static void print_diff_byte(uint8_t d, const char *sep){
+    unsigned int n=d>>4;
+    if(0==n) bytes_utiles_printf("."); else bytes_utiles_printf("%X",n);
+    n = d & 0xF;
+    if(0==n) bytes_utiles_printf("."); else bytes_utiles_printf("%X",n);
+    bytes_utiles_printf("%s",sep);
+}
+static void print_diff_bytes_sep(const char *msg,const void *vbuf, unsigned int size, const char *m2, const char *sep){
+    const uint8_t*const buf = (const uint8_t*const)vbuf;
+    bytes_utiles_printf("%s",msg);
+    if(size){
+        unsigned int i;
+        for(i=0;i<size;i++) print_diff_byte(buf[i],sep);
+    }
+    bytes_utiles_printf("%s",m2);
+}
+static void print_bytes_sep(const char *msg,const void *vbuf, unsigned int size, const char *m2, const char *sep){
+    const uint8_t*const buf = (const uint8_t*const)vbuf;
+    bytes_utiles_printf("%s",msg);
+    if(size){
+        unsigned int i;
+        for(i=0;i<size;i++) bytes_utiles_printf("%02X%s",buf[i],sep);
+    }
+    bytes_utiles_printf("%s",m2);
+}
+static void println_128(const char *msg, const void *buf){
+    print_bytes_sep(msg,buf,16,"\n","");
+}
+static void xor_bytes(uint8_t *dst, const uint8_t *src, unsigned int size){
+    for(unsigned int i=0;i<size;i++) dst[i] ^= src[i];
+}
+static unsigned int hexstr_to_bytes(uint8_t *dst, unsigned int dst_size, const char *hexstr){
+    unsigned int len = strlen(hexstr);
+    if(dst_size>(len/2))
+        dst_size = (len/2);
+    memset(dst,0,dst_size);
+    for(unsigned int i=0;i<dst_size*2;i++){
+        unsigned int c = (unsigned char)hexstr[i];
+        unsigned int nibble = (c<='9') ? (c-'0') : (((c|0x20)-'a')+10);
+        dst[i/2] |= nibble << (4*(1-(i%2)));
+    }
+    return dst_size;
+}
+static void bytes_utils_remove_unused_warnings(void){
+    (void)print_diff_bytes_sep;
+    (void)print_bytes_sep;
+    (void)println_128;
+    (void)xor_bytes;
+    (void)hexstr_to_bytes;
+}
+#endif
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drygascon128_le32.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drygascon128_le32.h
new file mode 100644
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drygascon128_le32.h
+#include <stdint.h>
+typedef uint64_t DRYSPONGE_EXT_t;
+
+#define DRYSPONGE_EXT
+
+#include "drysponge_common.h"
+
+//input width for one round of MixPhaseRound
+#define DRYSPONGE_MPR_INPUT_MASK ((((uint64_t)1)<<DRYSPONGE_MPR_INPUT_WIDTH)-1)
+
+#if DRYSPONGE_XSIZE32>16
+    #error "DRYSPONGE_XSIZE32>16"
+#endif
+
+#if DRYSPONGE_XSIZE32 == 4
+    #define DRYSPONGE_X_IDX_WIDTH 2
+#endif
+
+#if DRYSPONGE_MPR_INPUT_WIDTH == 10
+    #define DRYSPONGE_RANK_BYTES 2
+    typedef uint32_t permut_rank_t;
+#endif
+#if DRYSPONGE_MPR_INPUT_WIDTH == 18
+    #define DRYSPONGE_RANK_BYTES 3
+    typedef uint32_t permut_rank_t;
+#endif
+
+#define DRYSPONGE_X_IDX_MASK ((1<<DRYSPONGE_X_IDX_WIDTH)-1)
+
+#ifndef DRYSPONGE_OPT_F
+DRYSPONGE_FUNC void DRYSPONGE_MixPhaseRound(
+    DRYSPONGE_EXT_t ext,
+    uint64_t *const c64,
+    const uint32_t *const x32,
+    const uint8_t *const in,
+    unsigned int bitidx,
+    unsigned int blocksize
+){
+    const unsigned int byteidx = bitidx/8;
+    const unsigned int shift = bitidx%8;
+    unsigned int cpy = blocksize-byteidx;
+    uint64_t tmp = 0;
+    if(cpy>sizeof(tmp)) cpy = sizeof(tmp);
+    memcpy(&tmp,in+byteidx,cpy);
+    uint64_t r = (tmp>>shift) & DRYSPONGE_MPR_INPUT_MASK;
+    r^=ext;
+    for(unsigned int j=0;j<DRYSPONGE_MPR_INPUT_WIDTH/DRYSPONGE_X_IDX_WIDTH;j++){
+        unsigned int i = r & DRYSPONGE_X_IDX_MASK;
+        r = r >> DRYSPONGE_X_IDX_WIDTH;
+        c64[j]^=x32[i];
+    }
+}
+#endif
+
+struct DRYSPONGE_struct_t;
+typedef struct DRYSPONGE_struct_t DRYSPONGE_t ;
+DRYSPONGE_FUNC void DRYSPONGE_MixPhase(
+    DRYSPONGE_t *const ctx,
+    const uint8_t *const in
+);
+DRYSPONGE_FUNC void DRYSPONGE_CoreRound(
+    DRYSPONGE_t *const ctx,
+    unsigned int r
+);
+
+#include "drysponge_le32.h"
+
+#ifndef DRYSPONGE_OPT_F
+DRYSPONGE_FUNC void DRYSPONGE_MixPhase(
+    DRYSPONGE_t *const ctx,
+    const uint8_t *const in
+){
+    unsigned int bitidx=0;
+    #if DRYSPONGE_MPR_ROUNDS > 1
+    for(unsigned int i=0;i<DRYSPONGE_MPR_ROUNDS-1;i++){
+        #if DRYSPONGE_DBG_EN >= 4
+        printf("Mix phase MixPhaseRound entry %lu:\n",i);
+        DRYSPONGE_print_state(ctx);
+        #endif
+        DRYSPONGE_EXT_t ext=0;
+        #if ((DRYSPONGE_MPR_ROUNDS-1)*(DRYSPONGE_MPR_INPUT_WIDTH))>(DRYSPONGE_BLOCKSIZE*8)
+        if((ctx->ext) && (i==(DRYSPONGE_MPR_ROUNDS-2))){
+            //DS info is split across this block and the last one
+            ext = ctx->ext;
+            ctx->ext = ctx->ext >> ((DRYSPONGE_BLOCKSIZE*8)%DRYSPONGE_MPR_INPUT_WIDTH);
+            ctx->ext = ctx->ext >> ((((DRYSPONGE_MPR_ROUNDS-1)*DRYSPONGE_MPR_INPUT_WIDTH))-(DRYSPONGE_BLOCKSIZE*8));
+        }
+        #endif
+        DRYSPONGE_MixPhaseRound(ext,ctx->c,ctx->x,in,bitidx,DRYSPONGE_BLOCKSIZE);
+        bitidx+=DRYSPONGE_MPR_INPUT_WIDTH;
+        #if DRYSPONGE_DBG_EN >= 4
+        printf("Mix phase CoreRound entry %lu:\n",i);
+        DRYSPONGE_print_state(ctx);
+        #endif
+        DRYSPONGE_CoreRound(ctx,0);
+    }
+    #endif
+    #if DRYSPONGE_DBG_EN >= 4
+    printf("Mix phase MixPhaseRound entry %lu:\n",DRYSPONGE_MPR_ROUNDS-1);
+    DRYSPONGE_print_state(ctx);
+    #endif
+    DRYSPONGE_MixPhaseRound(ctx->ext,ctx->c,ctx->x,in,bitidx,DRYSPONGE_BLOCKSIZE);
+    ctx->ext=0;
+}
+#endif
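The data-dependent lookup at the heart of the mix phase is easier to follow outside the macro soup. A minimal sketch, assuming the DRYGASCON128 parameters visible above (four x words, so DRYSPONGE_X_IDX_WIDTH == 2, and DRYSPONGE_MPR_INPUT_WIDTH == 10 bits per round feeding a 5-word capacity); the names and the ds handling here are illustrative, not the reference API:

```c
#include <stdint.h>

#define X_IDX_WIDTH    2u                          /* 4 x words -> 2-bit index */
#define X_IDX_MASK     ((1u << X_IDX_WIDTH) - 1u)
#define MPR_INPUT_BITS 10u                         /* input bits per mix round */

/* One mix round: 10 input bits (optionally combined with a
   domain-separation value ds) are split into five 2-bit indices;
   each index selects one of the four x words, which is XORed into
   the next capacity word. */
static void mix_round_sketch(uint64_t c[5], const uint32_t x[4],
                             uint32_t in_bits, uint32_t ds)
{
    uint32_t r = (in_bits ^ ds) & ((1u << MPR_INPUT_BITS) - 1u);
    for (unsigned j = 0; j < MPR_INPUT_BITS / X_IDX_WIDTH; j++) {
        unsigned i = r & X_IDX_MASK;   /* which x word to inject */
        r >>= X_IDX_WIDTH;
        c[j] ^= x[i];                  /* key-dependent injection into c */
    }
}
```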
+
+//#ifndef DRYSPONGE_OPT_G //keep for now, needed for key init
+DRYSPONGE_FUNC void gascon_sboxes(uint64_t * const x, unsigned int nw){
+    uint64_t t[DRYSPONGE_CAPACITYSIZE64];
+    const unsigned int mid = nw/2;
+    for(unsigned int i=0;i
+#if DRYSPONGE_CAPACITYSIZE64 > 5
+    x[5] ^= gascon_rotr64_interleaved(x[5], 31) ^ gascon_rotr64_interleaved(x[5], 26);
+    x[6] ^= gascon_rotr64_interleaved(x[6], 53) ^ gascon_rotr64_interleaved(x[6], 58);
+    x[7] ^= gascon_rotr64_interleaved(x[7], 9) ^ gascon_rotr64_interleaved(x[7], 46);
+    x[8] ^= gascon_rotr64_interleaved(x[8], 43) ^ gascon_rotr64_interleaved(x[8], 50);
+#endif
+}
+DRYSPONGE_FUNC void gascon_permutation_round(uint64_t* S, unsigned int round) {
+    (void)DRYSPONGE_rotr64;
+    // addition of round constant
+    gascon_add_cst(S, round);
+    // substitution layer
+    gascon_sboxes(S,DRYSPONGE_CAPACITYSIZE64);
+    // linear diffusion layer
+    gascon_lin_layer(S);
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_CoreRound(
+    DRYSPONGE_t *const ctx,
+    unsigned int r
+){
+    gascon_permutation_round(ctx->c, r);
+}
+
+#endif
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge.h
new file mode 100644
index 0000000..a351347
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge.h
@@ -0,0 +1 @@
+#include "drygascon128_le32.h"
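gascon_permutation_round() above keeps the classical three-step round shape, even though parts of the S-box and linear-layer definitions were lost in extraction. The same shape as a self-contained sketch; it uses Ascon's published S-box sequence, round constant, and rotation pairs as stand-ins, not the gascon interleaved constants from the elided code:

```c
#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

/* Shape of one permutation round, mirroring gascon_permutation_round():
   add a round constant, apply the bitsliced S-box, then the linear
   diffusion layer. The constants below are Ascon's published values,
   used here only as placeholders for the gascon definitions. */
static void round_sketch(uint64_t s[5], unsigned round)
{
    /* round constant into the middle word */
    s[2] ^= ((0xfull - round) << 4) | round;

    /* 5-bit S-box, bitsliced across the five words */
    uint64_t t[5];
    s[0] ^= s[4]; s[4] ^= s[3]; s[2] ^= s[1];
    for (int i = 0; i < 5; i++) t[i] = ~s[i] & s[(i + 1) % 5];
    for (int i = 0; i < 5; i++) s[i] ^= t[(i + 1) % 5];
    s[1] ^= s[0]; s[0] ^= s[4]; s[3] ^= s[2]; s[2] = ~s[2];

    /* linear layer: XOR each word with two rotations of itself,
       exactly the pattern of the x[5]..x[8] lines above */
    static const unsigned rot[5][2] = {{19,28},{61,39},{1,6},{10,17},{7,41}};
    for (int i = 0; i < 5; i++)
        s[i] ^= ROTR64(s[i], rot[i][0]) ^ ROTR64(s[i], rot[i][1]);
}
```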
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_common.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_common.h
new file mode 100644
index 0000000..36792aa
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_common.h
@@ -0,0 +1,141 @@
+#ifndef __DRYSPONGE_COMMON_H__
+#define __DRYSPONGE_COMMON_H__
+
+#ifndef DRYSPONGE_FUNC
+#define DRYSPONGE_FUNC inline static
+#endif
+
+//convention:
+//  width means length in bits
+//  size means length in bytes
+
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#if DRYSPONGE_DBG_EN
+#include "bytes_utils.h"
+#endif
+
+#define DRYSPONGE_PASS 0
+
+#define DRYSPONGE_DS 2
+#define DRYSPONGE_DD 1
+#define DRYSPONGE_DA 2
+#define DRYSPONGE_DM 3
+
+#define DRYSPONGE_STATESIZE (DRYSPONGE_CAPACITYSIZE+DRYSPONGE_BLOCKSIZE)
+#define DRYSPONGE_DIGESTSIZE (DRYSPONGE_KEYSIZE*2)
+#define DRYSPONGE_TAGSIZE DRYSPONGE_KEYSIZE
+#define DRYSPONGE_KEYMAXSIZE (DRYSPONGE_CAPACITYSIZE+DRYSPONGE_XSIZE)
+
+#define DRYSPONGE_DIVUP(a,b) (((a)+(b)-1)/(b))
+#define DRYSPONGE_ROTR32(x,n) (0xFFFFFFFF & (((x)>>(n))|((x)<<(0x1F & (32-(n))))))
+#define DRYSPONGE_ROTR64(x,n) (0xFFFFFFFFFFFFFFFF & (((x)>>(n))|((x)<<(0x3F & (64-(n))))))
+
+#define DRYSPONGE_STATESIZE32 DRYSPONGE_DIVUP(DRYSPONGE_STATESIZE,4)
+#define DRYSPONGE_CE_SIZE32 DRYSPONGE_DIVUP(DRYSPONGE_CE_SIZE,4)
+#define DRYSPONGE_BLOCKSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_BLOCKSIZE,4)
+#define DRYSPONGE_CAPACITYSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_CAPACITYSIZE,4)
+#define DRYSPONGE_XSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_XSIZE,4)
+#define DRYSPONGE_KEYSIZE32 DRYSPONGE_DIVUP(DRYSPONGE_KEYSIZE,4)
+
+#define DRYSPONGE_STATESIZE64 DRYSPONGE_DIVUP(DRYSPONGE_STATESIZE,8)
+#define DRYSPONGE_CE_SIZE64 DRYSPONGE_DIVUP(DRYSPONGE_CE_SIZE,8)
+#define DRYSPONGE_BLOCKSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_BLOCKSIZE,8)
+#define DRYSPONGE_CAPACITYSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_CAPACITYSIZE,8)
+#define DRYSPONGE_XSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_XSIZE,8)
+#define DRYSPONGE_KEYSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_KEYSIZE,8)
+#define DRYSPONGE_TAGSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_TAGSIZE,8)
+#define DRYSPONGE_KEYMAXSIZE64 DRYSPONGE_DIVUP(DRYSPONGE_KEYMAXSIZE,8)
+#define DRYSPONGE_NONCESIZE64 DRYSPONGE_DIVUP(DRYSPONGE_NONCESIZE,8)
+
+#if DRYSPONGE_NONCESIZE < 12
+    #error "DRYSPONGE_NONCESIZE < 12"
+#endif
+
+#if DRYSPONGE_KEYSIZE < 16
+    #error "DRYSPONGE_KEYSIZE < 16"
+#endif
+
+#if DRYSPONGE_DIGESTSIZE < 2*DRYSPONGE_KEYSIZE
+    #error "DRYSPONGE_DIGESTSIZE < 2*DRYSPONGE_KEYSIZE"
+#endif
+
+#if DRYSPONGE_ACCUMULATE_FACTOR > ((DRYSPONGE_CAPACITYSIZE/4)/DRYSPONGE_BLOCKSIZE32)
+    #error "DRYSPONGE_ACCUMULATE_FACTOR > ((DRYSPONGE_CAPACITYSIZE/4)/DRYSPONGE_BLOCKSIZE32)"
+#endif
+
+#ifdef DRYSPONGE_EXT
+#define DRYSPONGE_EXT_ARG (&(ctx->ext))
+#else
+#define DRYSPONGE_EXT_ARG 0
+#endif
+
+DRYSPONGE_FUNC unsigned int DRYSPONGE_DSINFO(unsigned int padded, unsigned int domain, unsigned int finalize){
+    #if DRYSPONGE_DBG_EN
+    bytes_utiles_printf("    Adding DS: padded=%d, domain=%u, finalize=%d\n",padded,domain,finalize);
+    #endif
+    return padded+(finalize<<1)+(domain<<2);
+}
+
+DRYSPONGE_FUNC uint32_t DRYSPONGE_rotr32(uint32_t x, unsigned int n){
+    assert(n<32);
+    return DRYSPONGE_ROTR32(x,n);
+}
+
+DRYSPONGE_FUNC uint64_t DRYSPONGE_rotr64(uint64_t x, unsigned int n){
+    assert(n<64);
+    return DRYSPONGE_ROTR64(x,n);
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_xor(
+    const uint8_t *const a,//exactly one block of input
+    const uint8_t *const b,
+    uint8_t *const y
+){
+    for(unsigned int i=0;i<DRYSPONGE_BLOCKSIZE;i++){
+        y[i] = a[i] ^ b[i];
+    }
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_xor64(
+    const uint64_t *const a,//exactly one block of input
+    const uint64_t *const b,
+    uint64_t *const y
+){
+    for(unsigned int i=0;i<DRYSPONGE_BLOCKSIZE64;i++){
+        y[i] = a[i] ^ b[i];
+    }
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_load16(uint16_t* x, const uint8_t* in) {
+    *x = (in[1]<<8) | in[0];
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_load32(uint32_t* x, const uint8_t* in) {
+    *x = 0;
+    for(unsigned int i = 0;i<4;i++){
+        uint32_t b = in[i];
+        *x = *x | (b<<(8*i));
+    }
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_store32(uint8_t* out, uint32_t x) {
+    for(unsigned int i = 0;i<4;i++){
+        out[i] = x >> (8*i);
+    }
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_load64(uint64_t* x, uint8_t* in) {
+    *x = 0;
+    for(unsigned int i = 0;i<8;i++){
+        uint64_t b = in[i];
+        *x = *x | (b<<(8*i));
+    }
+}
+
+DRYSPONGE_FUNC void DRYSPONGE_store64(uint8_t* out, uint64_t x) {
+    (void)DRYSPONGE_rotr32;
+    (void)DRYSPONGE_load16;
+    (void)DRYSPONGE_store32;
+    for(unsigned int i = 0;i<8;i++){
+        out[i] = x >> (8*i);
+    }
+}
+
+#endif
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_dbg_support.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_dbg_support.h
new file mode 100644
index 0000000..5b6eafc
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_dbg_support.h
@@ -0,0 +1,49 @@
+#ifndef __DRYSPONGE_DBG_SUPPORT_H__
+#define __DRYSPONGE_DBG_SUPPORT_H__
+
+#define DRYSPONGE_DBG_NONE 0
+#define DRYSPONGE_DBG_ALG_IO 1
+#define DRYSPONGE_DBG_F_IO 2
+#define DRYSPONGE_DBG_ROUND_IO 3
+#define DRYSPONGE_DBG_FULL 4
+
+#if DRYSPONGE_DBG_EN
+    #define DRYSPONGE_DBG(a) a;
+#else
+    #define DRYSPONGE_DBG(a)
+#endif
+
+
+#if DRYSPONGE_DBG_EN
+#include <stdio.h>
+#include <stdint.h>
+#include "bytes_utils.h"
+static void DRYSPONGE_print_state(
+    DRYSPONGE_t *const ctx
+){
+    (void)xor_bytes;
+    (void)println_128;
+    (void)bytes_utils_remove_unused_warnings;
+    unsigned int linesize = 32;
+    if(linesize<DRYSPONGE_BLOCKSIZE) linesize = DRYSPONGE_BLOCKSIZE;
+    const uint8_t *const c = (const uint8_t *const)ctx->c;
+    for(unsigned int i=0;i<DRYSPONGE_CAPACITYSIZE;i+=linesize)
+        print_bytes_sep("    C = ",c+i,linesize,"\n","");
+    const uint8_t *const x = (const uint8_t *const)ctx->x;
+    for(unsigned int i=0;i<DRYSPONGE_XSIZE;i+=linesize)
+        print_bytes_sep("    X = ",x+i,linesize,"\n","");
+}
+#endif
+
+#endif
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_le32.h b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_le32.h
new file mode 100644
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/drysponge_le32.h
+#ifdef DRYSPONGE_OPT_G
+void drygascon128_g(uint64_t* x, uint32_t rounds);
+static void DRYSPONGE_g(
+    DRYSPONGE_t *const ctx
+){
+    DRYSPONGE_OPT_G((uint64_t*)&(ctx->c),ctx->rounds);
+}
+#else
+static void DRYSPONGE_g(
+    DRYSPONGE_t *const ctx
+){
+    #if DRYSPONGE_DBG_EN
+    printf("  G entry %lu:\n",ctx->fcnt);
+    DRYSPONGE_print_state(ctx);
+    #endif
+    ctx->fcnt++;
+    DRYSPONGE_xor64(ctx->r,ctx->r,ctx->r);//r=0
+    for(unsigned int j = 0;j<ctx->rounds;j++){
+        #if DRYSPONGE_DBG_EN >= DRYSPONGE_DBG_ROUND_IO
+        printf("    CoreRound entry %d:\n",j);
+        DRYSPONGE_print_state(ctx);
+        #endif
+
DRYSPONGE_CoreRound(ctx,j); + uint32_t r32[DRYSPONGE_BLOCKSIZE32]; + uint32_t cpart[DRYSPONGE_BLOCKSIZE32]; + memcpy(r32,ctx->r,sizeof(r32)); + for(unsigned int k=0;kc+k*DRYSPONGE_BLOCKSIZE64,sizeof(cpart)); + for(unsigned int i=0;ir,r32,sizeof(r32)); + } +} +#endif + +#ifdef DRYSPONGE_OPT_F +static void DRYSPONGE_DomainSeparator( + DRYSPONGE_EXT_t *const ext, + unsigned int dsinfo +){ + *ext = dsinfo; +} +void drygascon128_f(uint64_t* x, uint32_t*in,uint32_t ds,uint32_t rounds); +static void DRYSPONGE_f( + DRYSPONGE_t *const ctx, + const uint8_t *const i +){ + DRYSPONGE_OPT_F((uint64_t*)&(ctx->c),(uint32_t*)i,(uint32_t)ctx->ext,ctx->rounds); + ctx->ext=0; +} +#else +static void DRYSPONGE_f( + DRYSPONGE_t *const ctx, + const uint8_t *const i +){ + #if DRYSPONGE_DBG_EN + printf(" F entry %lu:\n",ctx->fcnt); + DRYSPONGE_print_state(ctx); + print_bytes_sep(" I = ",i,DRYSPONGE_BLOCKSIZE,"\n",""); + #endif + DRYSPONGE_MixPhase(ctx,i); + #if DRYSPONGE_DBG_EN >= DRYSPONGE_DBG_ROUND_IO + printf(" After mix phase:\n"); + DRYSPONGE_print_state(ctx); + #endif + DRYSPONGE_g(ctx); +} +#endif + +static void DRYSPONGE_set_key( + DRYSPONGE_t *const ctx, + const uint8_t *const key, + const unsigned int keylen +){ + assert(DRYSPONGE_KEYSIZE<=keylen); + const unsigned int midkeysize = DRYSPONGE_KEYSIZE+DRYSPONGE_XSIZE; + const unsigned int fullkeysize = DRYSPONGE_CAPACITYSIZE+DRYSPONGE_XSIZE; + if(DRYSPONGE_KEYSIZE!=keylen){//all words for x assumed to be different + if(fullkeysize == keylen){ + memcpy(ctx->c,key,DRYSPONGE_CAPACITYSIZE); + memcpy(ctx->x,key+DRYSPONGE_CAPACITYSIZE,DRYSPONGE_XSIZE); + } else { + uint8_t c[DRYSPONGE_CAPACITYSIZE]; + uint8_t x[DRYSPONGE_XSIZE]; + assert(midkeysize==keylen); + for(unsigned int i=0;ic,c,DRYSPONGE_CAPACITYSIZE); + memcpy(ctx->x,x,DRYSPONGE_XSIZE); + } + }else{ + uint8_t c[DRYSPONGE_CAPACITYSIZE]; + for(unsigned int i=0;ic,c,DRYSPONGE_CAPACITYSIZE); + DRYSPONGE_CoreRound(ctx,0); + //need to fixup x such that all words are different + unsigned int modified=1; + while(modified){ + uint32_t c32[DRYSPONGE_CAPACITYSIZE32]; + memcpy(c32,ctx->c,DRYSPONGE_CAPACITYSIZE); + modified=0; + for(unsigned int i=0;ix,ctx->c,DRYSPONGE_XSIZE); + memcpy(ctx->c,key,DRYSPONGE_XSIZE); + } + uint32_t x32[DRYSPONGE_XSIZE32];// = (uint32_t *const)ctx->x; + memcpy(x32,ctx->x,DRYSPONGE_XSIZE); + //sanity check: all words in x shall be different + for(unsigned int i=0;i DRYSPONGE_BLOCKSIZE ? 
DRYSPONGE_BLOCKSIZE : remaining; + memcpy(out,ctx->r,len); + out+=len; + remaining-=len; + if(remaining){ + DRYSPONGE_g(ctx); + } + } +} + +static void DRYSPONGE_init_ctx( + DRYSPONGE_t *const ctx +){ + #ifdef DRYSPONGE_EXT + memset(DRYSPONGE_EXT_ARG,0,sizeof(DRYSPONGE_EXT_t)); + #endif + ctx->fcnt=0; + memset(ctx->r,0x00,DRYSPONGE_BLOCKSIZE); +} + +static void DRYSPONGE_hash( + const uint8_t *const message, + const size_t mlen, + uint8_t *const digest +){ + DRYSPONGE_t ctx_storage; + DRYSPONGE_t *const ctx = &ctx_storage; + DRYSPONGE_init_ctx(ctx); + ctx->rounds=DRYSPONGE_ROUNDS; + #if DRYSPONGE_DBG_EN + printf("Hashing %lu bytes message: ",mlen); + print_bytes_sep("",message,mlen,"\n",""); + #endif + const uint64_t CST_H[] = { + 0xd308a385886a3f24, + 0x447370032e8a1913, + 0xd0319f29223809a4, + 0x896c4eec98fa2e08, + 0x7713d038e6212845, + 0x6c0ce934cf6654be, + 0xdd507cc9b729acc0, + 0x170947b5b5d5843f, + 0x1bfb7989d9d51692, + 0xacb5df98a60b31d1, + 0xb7df1ad0db72fd2f, + 0x967e266aedafe1b8, + 0x997f2cf145907cba, + 0xf76c91b34799a124, + 0x16fc8e85e2f20108, + 0x694e5771d8206963, + }; + DRYSPONGE_set_key(ctx,(const uint8_t*)CST_H,DRYSPONGE_KEYSIZE+DRYSPONGE_XSIZE); + DRYSPONGE_absorb_only(ctx,message,mlen,DRYSPONGE_DS,1); + DRYSPONGE_squeez_only(ctx,digest,DRYSPONGE_DIGESTSIZE); + #if DRYSPONGE_DBG_EN + printf(" Final state:\n"); + DRYSPONGE_print_state(ctx); + print_bytes_sep(" Digest: ",digest,DRYSPONGE_DIGESTSIZE,"\n",""); + #endif +} + +static void DRYSPONGE_init( + DRYSPONGE_t *const ctx, + const uint8_t *const key, + const unsigned int klen, + const uint8_t *const nonce, + uint8_t *out_buffer,//output buffer + unsigned int finalize +){ + DRYSPONGE_init_ctx(ctx); + ctx->rounds=DRYSPONGE_ROUNDS; + DRYSPONGE_set_key(ctx,key,klen); + ctx->obuf = out_buffer; + DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(0,DRYSPONGE_DD,finalize)); + ctx->rounds=DRYSPONGE_INIT_ROUNDS; + #if DRYSPONGE_NONCESIZE>DRYSPONGE_BLOCKSIZE + assert(0==(DRYSPONGE_NONCESIZE%DRYSPONGE_BLOCKSIZE)); + unsigned int nloops = DRYSPONGE_DIVUP(DRYSPONGE_NONCESIZE,DRYSPONGE_BLOCKSIZE); + for(unsigned int i=0;irounds=DRYSPONGE_ROUNDS; +} + +static void DRYSPONGE_enc_core( + DRYSPONGE_t *const ctx, + const uint64_t *const ib//exactly one block of input +){ + + DRYSPONGE_xor((uint8_t *)ctx->r,(uint8_t *)ib,ctx->obuf); + DRYSPONGE_f(ctx,(uint8_t *)ib); + ctx->obuf+=DRYSPONGE_BLOCKSIZE; +} + +static void DRYSPONGE_enc_core_aligned( + DRYSPONGE_t *const ctx, + const uint64_t *const ib//exactly one block of input +){ + assert((((uintptr_t)ctx->obuf)%8) == 0); + DRYSPONGE_xor64(ctx->r,ib,(uint64_t*const)ctx->obuf); + DRYSPONGE_f(ctx,(uint8_t *)ib); + ctx->obuf+=DRYSPONGE_BLOCKSIZE; +} + +static const uint8_t* DRYSPONGE_enc_blocks( + DRYSPONGE_t *const ctx, + const uint8_t *im,//whole message + size_t m +){ + (void)DRYSPONGE_load32; + (void)DRYSPONGE_store32; + (void)DRYSPONGE_load64; + (void)DRYSPONGE_store64; + uint64_t buf64[DRYSPONGE_BLOCKSIZE64]; + const uint64_t *ib64; + #if DRYSPONGE_BLOCKSIZE % ALIGN64 + unsigned int input_aligned = 0; + unsigned int output_aligned = 0; + #else + unsigned int input_aligned = 0==(((uintptr_t)im)%ALIGN64); + unsigned int output_aligned = 0==(((uintptr_t)ctx->obuf)%ALIGN64); + #endif + if(input_aligned && output_aligned){ + for(size_t i = 0; ir,ib,ctx->obuf); + DRYSPONGE_f(ctx,ctx->obuf); + ctx->obuf+=DRYSPONGE_BLOCKSIZE; +} + +static void DRYSPONGE_dec_core_aligned( + DRYSPONGE_t *const ctx, + const uint64_t *const ib//exactly one block of input +){ + 
DRYSPONGE_xor64(ctx->r,ib,(uint64_t*const)ctx->obuf);
+    DRYSPONGE_f(ctx,ctx->obuf);
+    ctx->obuf+=DRYSPONGE_BLOCKSIZE;
+}
+
+static const uint8_t* DRYSPONGE_dec_blocks(
+    DRYSPONGE_t *const ctx,
+    const uint8_t *im,//whole message
+    size_t m
+){
+    const uint64_t *ib64;
+    #if DRYSPONGE_BLOCKSIZE % ALIGN64
+    unsigned int input_aligned = 0;
+    unsigned int output_aligned = 0;
+    #else
+    unsigned int input_aligned = 0==(((uintptr_t)im)%ALIGN64);
+    unsigned int output_aligned = 0==(((uintptr_t)ctx->obuf)%ALIGN64);
+    #endif
+    if(input_aligned && output_aligned){
+        for(size_t i = 0; i
+        ctx->obuf = ciphertext + mlen;//fix the size
+    }
+    DRYSPONGE_squeez_only(ctx,ctx->obuf,DRYSPONGE_TAGSIZE);
+    *clen = mlen+DRYSPONGE_TAGSIZE;
+    #if DRYSPONGE_DBG_EN
+    printf("  Final state:\n");
+    DRYSPONGE_print_state(ctx);
+    print_bytes_sep("  CipherText: ",ciphertext,*clen,"\n","");
+    #endif
+}
+
+//WARNING: the function writes plaintext into "message" before checking the tag.
+//It is the responsibility of the caller to ensure that the "message" buffer is
+//not accessible by anything until this function has returned.
+static int DRYSPONGE_dec(
+    const uint8_t *const key,
+    const unsigned int klen,
+    const uint8_t *const nonce,
+    const uint8_t *const ciphertext,
+    const size_t clen,
+    const uint8_t * const ad,
+    const size_t alen,
+    uint8_t *message
+){
+    if(clen
+        DRYSPONGE_xor64(ctx->r,last_block64,last_block64);
+        uint8_t mpad = DRYSPONGE_padding(last_block,remaining,last_block);
+        im+=remaining;
+        DRYSPONGE_DomainSeparator(DRYSPONGE_EXT_ARG,DRYSPONGE_DSINFO(mpad,DRYSPONGE_DM,1));
+        memcpy(ctx->obuf,last_block,remaining);
+        DRYSPONGE_f(ctx,last_block);
+    }
+    uint64_t tag64[DRYSPONGE_TAGSIZE64];
+    uint8_t*tag = (uint8_t*)tag64;
+    DRYSPONGE_squeez_only(ctx,tag,DRYSPONGE_TAGSIZE);
+    DRYSPONGE_DBG(print_bytes_sep("expected tag=",im,DRYSPONGE_TAGSIZE,"\n",""));
+    DRYSPONGE_DBG(print_bytes_sep("computed tag=",tag,DRYSPONGE_TAGSIZE,"\n",""));
+    if(memcmp(tag,im,DRYSPONGE_TAGSIZE)){
+        memset(message,0,mlen);//erase all output
+        return ~DRYSPONGE_PASS;
+    }
+    return DRYSPONGE_PASS;
+}
+#endif
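The crypto_aead_encrypt()/crypto_aead_decrypt() wrappers in the next file are thin adapters over DRYSPONGE_enc()/DRYSPONGE_dec(). A minimal caller sketch against this standard NIST LWC AEAD API, with buffer sizes taken from api.h above; the demo() harness and its all-zero key/nonce are illustrative only:

```c
#include <string.h>

int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen,
                        const unsigned char *m, unsigned long long mlen,
                        const unsigned char *ad, unsigned long long adlen,
                        const unsigned char *nsec, const unsigned char *npub,
                        const unsigned char *k);
int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
                        unsigned char *nsec, const unsigned char *c,
                        unsigned long long clen, const unsigned char *ad,
                        unsigned long long adlen, const unsigned char *npub,
                        const unsigned char *k);

int demo(void)
{
    unsigned char k[16] = {0}, npub[16] = {0};   /* test values only */
    unsigned char m[4] = "abc", ad[4] = "hdr";
    unsigned char c[sizeof(m) + 16];             /* mlen + CRYPTO_ABYTES */
    unsigned char out[sizeof(m)];
    unsigned long long clen, mlen;

    crypto_aead_encrypt(c, &clen, m, sizeof(m), ad, sizeof(ad), 0, npub, k);
    if (crypto_aead_decrypt(out, &mlen, 0, c, clen, ad, sizeof(ad), npub, k))
        return -1;                               /* tag mismatch */
    return memcmp(out, m, (size_t)mlen) != 0;    /* 0 on round-trip success */
}
```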
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/encrypt.c b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/encrypt.c
new file mode 100644
index 0000000..c42ca12
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/encrypt.c
@@ -0,0 +1,52 @@
+#include "crypto_aead.h"
+#define DRYSPONGE_OPT_G drygascon128_g
+#define DRYSPONGE_OPT_F drygascon128_f
+#include "drysponge.h"
+
+/**
+generating a ciphertext c[0],c[1],...,c[*clen-1]
+from a plaintext m[0],m[1],...,m[mlen-1]
+and associated data ad[0],ad[1],...,ad[adlen-1]
+and nonce npub[0],npub[1],...
+and secret key k[0],k[1],...
+the implementation shall not use nsec
+*/
+int crypto_aead_encrypt(
+    unsigned char *c,unsigned long long *clen,
+    const unsigned char *m,unsigned long long mlen,
+    const unsigned char *ad,unsigned long long adlen,
+    const unsigned char *nsec,
+    const unsigned char *npub,
+    const unsigned char *k
+){
+    (void) nsec; //avoid warning
+    (void) DRYSPONGE_hash; //avoid warning
+    size_t impl_clen;
+    DRYSPONGE_enc(k,DRYSPONGE_KEYSIZE,npub,m,mlen,ad,adlen,c,&impl_clen);
+    *clen = impl_clen;
+    return 0;
+}
+
+/**
+the code for the AEAD implementation goes here,
+generating a plaintext m[0],m[1],...,m[*mlen-1]
+and secret message number nsec[0],nsec[1],...
+from a ciphertext c[0],c[1],...,c[clen-1]
+and associated data ad[0],ad[1],...,ad[adlen-1]
+and nonce number npub[0],npub[1],...
+and secret key k[0],k[1],...
+*/
+int crypto_aead_decrypt(
+    unsigned char *m,unsigned long long *mlen,
+    unsigned char *nsec,
+    const unsigned char *c,unsigned long long clen,
+    const unsigned char *ad,unsigned long long adlen,
+    const unsigned char *npub,
+    const unsigned char *k
+){
+    (void) nsec; //avoid warning
+    if(DRYSPONGE_PASS!=DRYSPONGE_dec(k,DRYSPONGE_KEYSIZE,npub,c,clen,ad,adlen,m))
+        return -1;
+    *mlen = clen - DRYSPONGE_TAGSIZE;
+    return 0;
+}
diff --git a/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/nistlwc b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/nistlwc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/drygascon/Implementations/crypto_aead/drygascon128/add_arm-cm0/nistlwc
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/api.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/api.h
new file mode 100644
index 0000000..fb1d58b
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/api.h
@@ -0,0 +1,5 @@
+#define CRYPTO_KEYBYTES 16
+#define CRYPTO_NSECBYTES 0
+#define CRYPTO_NPUBBYTES 16
+#define CRYPTO_ABYTES 16
+#define CRYPTO_NOOVERLAP 1
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/cofb.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/cofb.h
new file mode 100644
index 0000000..143c7d3
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/cofb.h
@@ -0,0 +1,20 @@
+#ifndef GIFT_COFB_H_
+#define GIFT_COFB_H_
+
+#define TAG_SIZE 16
+#define COFB_ENCRYPT 1
+#define COFB_DECRYPT 0
+
+#define XOR_BLOCK(x, y, z) ({ \
+    (x)[0] = (y)[0] ^ (z)[0]; \
+    (x)[1] = (y)[1] ^ (z)[1]; \
+    (x)[2] = (y)[2] ^ (z)[2]; \
+    (x)[3] = (y)[3] ^ (z)[3]; \
+})
+
+#define XOR_TOP_BAR_BLOCK(x, y) ({ \
+    (x)[0] ^= (y)[0]; \
+    (x)[1] ^= (y)[1]; \
+})
+
+#endif  // GIFT_COFB_H_
\ No newline at end of file
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/encrypt.c b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/encrypt.c
new file mode 100644
index 0000000..518c555
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/encrypt.c
@@ -0,0 +1,191 @@
+#include <string.h>
+#include "api.h"
+#include "cofb.h"
+#include "giftb128.h"
+
+static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){
+    u32 i;
+    if (no_of_bytes == 0) {
+        d[0] = 0x00000080; // little-endian
+        d[1] = 0x00000000;
+        d[2] = 0x00000000;
+        d[3] = 0x00000000;
+    }
+    else if (no_of_bytes < GIFT128_BLOCK_SIZE) {
+        for (i = 0; i < no_of_bytes/4+1; i++)
+            d[i] = s[i];
+        d[i-1] &= ~(0xffffffffL << (no_of_bytes % 4)*8);
+        d[i-1] |= 0x00000080L << (no_of_bytes % 4)*8;
+        for (; i < 4; i++)
+            d[i] = 0x00000000;
+    }
+    else {
+        d[0] = s[0];
+        d[1] = s[1];
+        d[2] = s[2];
+        d[3] = s[3];
+    }
+}
+
+static inline void double_half_block(u32* x) {
+    u32 tmp0;
+    tmp0 = (x)[0];
+    (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15);
+    (x)[0] |= ((x)[1] & 0x80808080) << 17;
+    (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15);
+    (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24;
+}
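double_half_block() above is multiplication by 2 in GF(2^64): the half-block is a 64-bit polynomial stored byte-first in two little-endian u32 words, which is why the shift is done per byte lane. Up to that byte order, the math is the familiar shift-and-reduce; a plain-uint64_t sketch, with the reduction polynomial x^64 + x^4 + x^3 + x + 1 (0x1B, matching the `* 27` above):

```c
#include <stdint.h>

/* Doubling in GF(2^64) on a plain uint64_t: shift left by one and
   conditionally XOR in the reduction constant 0x1B when the top bit
   falls off. double_half_block() computes the same thing on two
   little-endian u32 words, hence its byte-lane shifts. */
static uint64_t gf64_double(uint64_t x)
{
    uint64_t carry = x >> 63;              /* bit that falls off */
    return (x << 1) ^ (carry * 0x1B);      /* branch-free reduction */
}

/* Tripling is doubling plus the original value: 3*x = 2*x ^ x,
   exactly the trailing XORs in triple_half_block() below. */
static uint64_t gf64_triple(uint64_t x)
{
    return gf64_double(x) ^ x;
}
```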
+
+static inline void triple_half_block(u32* x) {
+    u32 tmp0, tmp1;
+    tmp0 = (x)[0];
+    tmp1 = (x)[1];
+    (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15);
+    (x)[0] |= ((x)[1] & 0x80808080) << 17;
+    (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15);
+    (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24;
+    (x)[0] ^= tmp0;
+    (x)[1] ^= tmp1;
+}
+
+static inline void g(u32 *x) {
+    u32 tmp0, tmp1;
+    tmp0 = (x)[0];
+    tmp1 = (x)[1];
+    (x)[0] = (x)[2];
+    (x)[1] = (x)[3];
+    (x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15);
+    (x)[2] |= ((tmp1 & 0x80808080) << 17);
+    (x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15);
+    (x)[3] |= ((tmp0 & 0x80808080) << 17);
+}
+
+static inline void rho1(u32* d, u32* y, u32* m, u32 n) {
+    g(y);
+    padding(d,m,n);
+    XOR_BLOCK(d, d, y);
+}
+
+static inline void rho(u32* y, u32* m, u32* x, u32* c, u32 n) {
+    XOR_BLOCK(c, y, m);
+    rho1(x, y, m, n);
+}
+
+static inline void rho_prime(u32* y, u32*c, u32* x, u32* m, u32 n) {
+    XOR_BLOCK(m, y, c);
+    rho1(x, y, m, n);
+}
+
+/****************************************************************************
+* Constant-time implementation of the GIFT-COFB authenticated cipher based on
+* fixsliced GIFTb-128. Encryption/decryption is handled by the same function,
+* depending on the 'encrypting' parameter (1/0).
+****************************************************************************/
+int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad,
+    u32 ad_len, const u8* in, u32 in_len, const int encrypting) {
+
+    u32 tmp0, tmp1, emptyA, emptyM, offset[2];
+    u32 input[4], rkey[80];
+    u8 Y[GIFT128_BLOCK_SIZE];
+
+    if (!encrypting) {
+        if (in_len < TAG_SIZE)
+            return -1;
+        in_len -= TAG_SIZE;
+    }
+
+    if(ad_len == 0)
+        emptyA = 1;
+    else
+        emptyA = 0;
+
+    if(in_len == 0)
+        emptyM = 1;
+    else
+        emptyM = 0;
+
+    gift128_keyschedule(key, rkey);
+    giftb128_encrypt_block(Y, rkey, nonce);
+    offset[0] = ((u32*)Y)[0];
+    offset[1] = ((u32*)Y)[1];
+
+    while(ad_len > GIFT128_BLOCK_SIZE){
+        rho1(input, (u32*)Y, (u32*)ad, GIFT128_BLOCK_SIZE);
+        double_half_block(offset);
+        XOR_TOP_BAR_BLOCK(input, offset);
+        giftb128_encrypt_block(Y, rkey, (u8*)input);
+        ad += GIFT128_BLOCK_SIZE;
+        ad_len -= GIFT128_BLOCK_SIZE;
+    }
+
+    triple_half_block(offset);
+    if((ad_len % GIFT128_BLOCK_SIZE != 0) || (emptyA))
+        triple_half_block(offset);
+    if(emptyM) {
+        triple_half_block(offset);
+        triple_half_block(offset);
+    }
+
+    rho1(input, (u32*)Y, (u32*)ad, ad_len);
+    XOR_TOP_BAR_BLOCK(input, offset);
+    giftb128_encrypt_block(Y, rkey, (u8*)input);
+
+    while (in_len > GIFT128_BLOCK_SIZE){
+        double_half_block(offset);
+        if (encrypting)
+            rho((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
+        else
+            rho_prime((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE);
+        XOR_TOP_BAR_BLOCK(input, offset);
+        giftb128_encrypt_block(Y, rkey, (u8*)input);
+        in += GIFT128_BLOCK_SIZE;
+        out += GIFT128_BLOCK_SIZE;
+        in_len -= GIFT128_BLOCK_SIZE;
+    }
+
+    if(!emptyM){
+        triple_half_block(offset);
+        if(in_len % GIFT128_BLOCK_SIZE != 0)
+            triple_half_block(offset);
+        if (encrypting) {
+            rho((u32*)Y, (u32*)in, input, (u32*)out, in_len);
+            out += in_len;
+        }
+        else {
+            rho_prime((u32*)Y, (u32*)in, input, (u32*)out, in_len);
+            in += in_len;
+        }
+        XOR_TOP_BAR_BLOCK(input, offset);
+        giftb128_encrypt_block(Y, rkey, (u8*)input);
+    }
+
+    if (encrypting) { // encryption mode
+        memcpy(out, Y, TAG_SIZE);
+        return 0;
+    }
+    // decrypting: constant-time tag comparison
+    tmp0 = 0;
+    for(tmp1 = 0; tmp1 < TAG_SIZE; tmp1++)
+        tmp0 |= in[tmp1] ^ Y[tmp1];
+    return tmp0;
+}
+
+int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
+    const unsigned char* m, unsigned long long mlen,
+    const unsigned char* ad, unsigned long long adlen,
+    const unsigned char* nsec, const unsigned char* npub,
+    const unsigned char* k) {
+    (void)nsec;
+    *clen = mlen + TAG_SIZE;
+    return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT);
+}
+
+int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen,
+    unsigned char* nsec, const unsigned char* c,
+    unsigned long long clen, const unsigned char* ad,
+    unsigned long long adlen, const unsigned char* npub,
+    const unsigned char *k) {
+    (void)nsec;
+    if (clen < TAG_SIZE)
+        return -1; // avoid *mlen underflow on truncated input
+    *mlen = clen - TAG_SIZE;
+    return giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT);
+}
\ No newline at end of file
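The decryption path of giftcofb_crypt() above deliberately avoids memcmp for the tag: it ORs the byte-wise differences so the loop's timing does not depend on where the first mismatch occurs, and returns the accumulator (0 iff the tags match). The same idiom in isolation:

```c
#include <stdint.h>
#include <stddef.h>

/* Constant-time tag check in the style of the decryption path above:
   accumulate byte differences with OR instead of branching or memcmp.
   Returns 0 iff the two n-byte tags are equal. */
static int ct_tag_diff(const uint8_t *a, const uint8_t *b, size_t n)
{
    uint8_t diff = 0;
    for (size_t i = 0; i < n; i++)
        diff |= a[i] ^ b[i];
    return diff;          /* 0 on match, nonzero otherwise */
}
```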
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/giftb128.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/giftb128.h
new file mode 100644
index 0000000..bcb4f36
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/giftb128.h
@@ -0,0 +1,13 @@
+#ifndef GIFT128_H_
+#define GIFT128_H_
+
+#define KEY_SIZE 16
+#define GIFT128_BLOCK_SIZE 16
+
+typedef unsigned char u8;
+typedef unsigned int u32;
+
+extern void gift128_keyschedule(const u8* key, u32* rkey);
+extern void giftb128_encrypt_block(u8* out_block, const u32* rkey, const u8* in_block);
+
+#endif  // GIFT128_H_
\ No newline at end of file
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/giftb128.s b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/giftb128.s
new file mode 100644
index 0000000..5e2b48f
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_balanced/giftb128.s
@@ -0,0 +1,768 @@
+/****************************************************************************
+* Balanced ARM assembly implementation of the GIFT-128 block cipher. This
+* implementation provides efficiency with limited impact on the code size.
+* See "Fixslicing: A New GIFT Representation" paper available at
+* https:// for more details.
+****************************************************************************/
+
+.syntax unified
+.thumb
+
+/*****************************************************************************
+* Round constants look-up table according to the fixsliced representation.
+*****************************************************************************/ +.align 2 +.type rconst,%object +rconst: +.word 0x10000008, 0x80018000, 0x54000002, 0x01010181 +.word 0x8000001f, 0x10888880, 0x6001e000, 0x51500002 +.word 0x03030180, 0x8000002f, 0x10088880, 0x60016000 +.word 0x41500002, 0x03030080, 0x80000027, 0x10008880 +.word 0x4001e000, 0x11500002, 0x03020180, 0x8000002b +.word 0x10080880, 0x60014000, 0x01400002, 0x02020080 +.word 0x80000021, 0x10000080, 0x0001c000, 0x51000002 +.word 0x03010180, 0x8000002e, 0x10088800, 0x60012000 +.word 0x40500002, 0x01030080, 0x80000006, 0x10008808 +.word 0xc001a000, 0x14500002, 0x01020181, 0x8000001a + +.align 2 +classical_key_update: + and r2, r10, r7, lsr #12 + and r3, r7, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r7, lsr #2 + orr r2, r2, r3 + and r7, r7, #0x00030000 + orr r7, r2, r7, lsl #14 + str.w r7, [r1, #4] //1st classical key update + str.w r5, [r1], #8 //1st classical key update + and r2, r10, r6, lsr #12 + and r3, r6, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r6, lsr #2 + orr r2, r2, r3 + and r6, r6, #0x00030000 + orr r6, r2, r6, lsl #14 + str.w r6, [r1, #4] //2nd classical key update + str.w r4, [r1], #8 //2nd classical key update + and r2, r10, r5, lsr #12 + and r3, r5, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r5, lsr #2 + orr r2, r2, r3 + and r5, r5, #0x00030000 + orr r5, r2, r5, lsl #14 + str.w r5, [r1, #4] //3rd classical key update + str.w r7, [r1], #8 //3rd classical key update + and r2, r10, r4, lsr #12 + and r3, r4, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r4, lsr #2 + orr r2, r2, r3 + and r4, r4, #0x00030000 + orr r4, r2, r4, lsl #14 + str.w r4, [r1, #4] //4th classical key update + str.w r6, [r1], #8 //4th classical key update + bx lr + +.align 2 +rearrange_rkey_0: + ldrd r6, r4, [r1] + eor r12, r6, r6, lsr #9 + and r12, r12, r3 + eor r6, r12 + eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9); + eor r12, r4, r4, lsr #9 + and r12, r12, r3 + eor r4, r12 + eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9); + eor r12, r6, r6, lsr #18 + and r12, r12, r10 + eor r6, r12 + eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18); + eor r12, r4, r4, lsr #18 + and r12, r12, r10 + eor r4, r12 + eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18); + eor r12, r6, r6, lsr #12 + and r12, r12, r11 + eor r6, r12 + eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12); + eor r12, r4, r4, lsr #12 + and r12, r12, r11 + eor r4, r12 + eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12); + eor r12, r6, r6, lsr #24 + and r12, r12, #0xff + eor r6, r12 + eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24); + eor r12, r4, r4, lsr #24 + and r12, r12, #0xff + eor r4, r12 + eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24); + str.w r6, [r1] + str.w r4, [r1, #4] + bx lr + +.align 2 +rearrange_rkey_1: + ldrd r5, r7, [r1] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6); + eor r8, r5, r5, lsr #6 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 
0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + str.w r5, [r1] + str.w r7, [r1, #4] + bx lr + +.align 2 +rearrange_rkey_2: + ldrd r5, r7, [r1] + eor r8, r7, r7, lsr #15 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15); + eor r8, r5, r5, lsr #15 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15); + eor r8, r7, r7, lsr #18 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18); + eor r8, r5, r5, lsr #18 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x00000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + str.w r5, [r1] + str.w r7, [r1, #4] + bx lr + +.align 2 +rearrange_rkey_3: + ldrd r5, r7, [r1] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6); + eor r8, r5, r5, lsr #6 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + str.w r5, [r1] + str.w r7, [r1, #4] + bx lr + +.align 2 +key_update_0: + ldrd r4, r5, [r1], #80 + and r2, r12, r4, ror #24 + and r4, r4, r11 + orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4) + eor r2, r4, r4, lsr #1 + and r2, r2, r8 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1) + eor r2, r5, r5, lsr #16 + and r2, r2, r10 + eor r5, r5, r2 + eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16) + eor r2, r5, r5, lsr #1 + and r2, r2, r9 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + and r2, r12, r5, ror #24 + and r5, r5, r11 + orr r5, r2, r5, ror #16 //KEY_TRIPLE_UPDATE_1(r5) + eor r2, r5, r5, lsr #1 + and r2, r2, r8 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x55551100, 1) + eor r2, r4, r4, lsr #16 + and r2, r2, r10 + eor r4, r4, r2 + eor r4, r4, r2, lsl #16 //SWAPMOVE(r4, r4, 0x00003333, 16) + eor r2, r4, r4, lsr #1 + and r2, r2, r9 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x555544444, 1) + str.w r5, [r1, #4] + str.w r4, [r1], #80 + and r2, r12, r4, ror #24 + and r4, r4, r11 + orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4) + eor r2, r4, 
r4, lsr #1 + and r2, r2, r8 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1) + eor r2, r5, r5, lsr #16 + and r2, r2, r10 + eor r5, r5, r2 + eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16) + eor r2, r5, r5, lsr #1 + and r2, r2, r9 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + bx lr + +.align 2 +key_update_1: + ldrd r4, r5, [r1], #80 + and r2, r9, r4, lsr #6 + and r3, r4, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #5 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r12, r5, lsr #4 + and r3, r5, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r5, lsr #6 + orr r2, r2, r3 + and r5, r5, r10 + orr r5, r2, r5, lsl #2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + and r2, r9, r5, lsr #6 + and r3, r5, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #5 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_TRIPLE_UPDATE_2(r5) + and r2, r12, r4, lsr #4 + and r3, r4, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r4, lsr #6 + orr r2, r2, r3 + and r4, r4, r10 + orr r4, r2, r4, lsl #2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r5, [r1, #4] + str.w r4, [r1], #80 + and r2, r9, r4, lsr #6 + and r3, r4, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #5 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r12, r5, lsr #4 + and r3, r5, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r5, lsr #6 + orr r2, r2, r3 + and r5, r5, r10 + orr r5, r2, r5, lsl#2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + bx lr + +.align 2 +key_update_2: + ldrd r4, r5, [r1], #80 + and r2, r12, r4, ror #24 + and r4, r11, r4, ror #20 + orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r11, r5, ror #24 + and r5, r12, r5, ror #16 + orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + and r2, r12, r5, ror #24 + and r5, r11, r5, ror #20 + orr r5, r5, r2 //KEY_TRIPLE_UPDATE_2(r5) + and r2, r11, r4, ror #24 + and r4, r12, r4, ror #16 + orr r4, r4, r2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r5, [r1, #4] + str.w r4, [r1], #80 + and r2, r12, r4, ror #24 + and r4, r11, r4, ror #20 + orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r11, r5, ror #24 + and r5, r12, r5, ror #16 + orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + bx lr + +.align 2 +key_update_3: + ldrd r4, r5, [r1], #80 + and r2, r10, r4, lsr #18 + and r3, r4, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r4, lsr #14 + orr r2, r2, r3 + and r3, r4, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7, lsr #16 + orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r9, r5, lsr #2 + and r3, r9, r5 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + and r2, r10, r5, lsr #18 + and r3, r5, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r5, lsr #14 + orr r2, r2, r3 + and r3, r5, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7, lsr #16 + orr r5, r2, r5, lsl #19 //KEY_TRIPLE_UPDATE_4(r5) + and r2, r9, r4, lsr #2 + and r3, r9, r4 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_DOUBLE_UPDATE_4(r4) + str.w r5, [r1, #4] + str.w r4, [r1], #80 + and r2, r10, 
r4, lsr #18 + and r3, r4, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r4, lsr #14 + orr r2, r2, r3 + and r3, r4, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7, lsr #16 + orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r9, r5, lsr #2 + and r3, r9, r5 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + bx lr + +.align 2 +key_update_4: + ldrd r4, r5, [r1], #80 + and r2, r7, r4, lsr #6 + and r3, r4, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r4, lsr #4 + orr r2, r2, r3 + and r4, r4, #0x000f + orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r10, r5, lsr #4 + and r3, r5, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r5, lsr #8 + orr r2, r2, r3 + and r5, r5, r8 + orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + and r2, r7, r5, lsr #6 + and r3, r5, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r5, lsr #4 + orr r2, r2, r3 + and r5, r5, #0x000f + orr r5, r2, r5, lsl #12 //KEY_TRIPLE_UPDATE_4(r5) + and r2, r10, r4, lsr #4 + and r3, r4, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r4, lsr #8 + orr r2, r2, r3 + and r4, r4, r8 + orr r4, r2, r4, lsl #8 //KEY_DOUBLE_UPDATE_4(r4) + str.w r5, [r1, #4] + str.w r4, [r1], #80 + and r2, r7, r4, lsr #6 + and r3, r4, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r4, lsr #4 + orr r2, r2, r3 + and r4, r4, #0x000f + orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r10, r5, lsr #4 + and r3, r5, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r5, lsr #8 + orr r2, r2, r3 + and r5, r5, r8 + orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5) + str.w r4, [r1, #4] + str.w r5, [r1], #80 + bx lr + +/***************************************************************************** +* Balanced implementation of the GIFT-128 key schedule according to the +* fixsliced representation. 
+*****************************************************************************/ +.align 2 +@ void gift128_keyschedule(const u8* key, u32* rkey) { +.global gift128_keyschedule +.type gift128_keyschedule,%function +gift128_keyschedule: + push {r1-r12, r14} + ldm r0, {r4-r7} //load key words + rev r4, r4 //endianness (could be skipped with another representation) + rev r5, r5 //endianness (could be skipped with another representation) + rev r6, r6 //endianness (could be skipped with another representation) + rev r7, r7 //endianness (could be skipped with another representation) + str.w r5, [r1, #4] + str.w r7, [r1], #8 //the first rkeys are not updated + str.w r4, [r1, #4] + str.w r6, [r1], #8 //the first rkeys are not updated + movw r12, #0x3fff + lsl r12, r12, #16 //r12<- 0x3fff0000 + movw r10, #0x000f //r10<- 0x0000000f + movw r9, #0x0fff //r9 <- 0x00000fff + bl classical_key_update //keyschedule using classical representation (10 rounds) + bl classical_key_update //keyschedule using classical representation (20 rounds) + sub.w r1, r1, #80 + movw r3, #0x0055 + movt r3, #0x0055 //r3 <- 0x00550055 + movw r10, #0x3333 //r10<- 0x00003333 + movw r11, #0x000f + movt r11, #0x000f //r11<- 0x000f000f + bl rearrange_rkey_0 //fixslice the rkeys + add.w r1, r1, #40 + bl rearrange_rkey_0 //fixslice the rkeys + sub.w r1, r1, #32 + movw r3, #0x1111 + movt r3, #0x1111 //r3 <- 0x11111111 + movw r10, #0x0303 + movt r10, #0x0303 //r10<- 0x03030303 + bl rearrange_rkey_1 //fixslice the rkeys + add.w r1, r1, #40 + bl rearrange_rkey_1 //fixslice the rkeys + sub.w r1, r1, #32 + movw r3, #0xaaaa //r3 <- 0x0000aaaa + movw r10, #0x3333 //r10<- 0x00003333 + movw r11, #0xf0f0 //r11<- 0x0000f0f0 + bl rearrange_rkey_2 //fixslice the rkeys + add.w r1, r1, #40 + bl rearrange_rkey_2 //fixslice the rkeys + sub.w r1, r1, #32 + movw r3, #0x0a0a + movt r3, #0x0a0a //r3 <- 0x0a0a0a0a + movw r10, #0x00cc + movt r10, #0x00cc //r10<- 0x00cc00cc + bl rearrange_rkey_3 //fixslice the rkeys + add.w r1, r1, #40 + bl rearrange_rkey_3 //fixslice the rkeys + sub.w r1, r1, #64 + movw r10, #0x3333 //r10<- 0x00003333 + eor r12, r10, r10, lsl #16 //r12<- 0w33333333 + mvn r11, r12 //r11<- 0xcccccccc + movw r9, #0x4444 + movt r9, #0x5555 //r9 <- 0x55554444 + movw r8, #0x1100 + movt r8, #0x5555 //r8 <- 0x55551100 + bl key_update_0 //keyschedule according to fixslicing + sub.w r1, r1, #280 + bl key_update_0 //keyschedule according to fixslicing + sub.w r1, r1, #352 + movw r12, #0x0f00 + movt r12, #0x0f00 //r12<- 0x0f000f00 + movw r11, #0x0003 + movt r11, #0x0003 //r11<- 0x00030003 + movw r10, #0x003f + movt r10, #0x003f //r10<- 0x003f003f + lsl r9, r11, #8 //r9 <- 0x03000300 + and r8, r10, r10, lsr #3 //r8 <- 0x00070007 + orr r7, r8, r8, lsl #2 //r7 <- 0x001f001f + bl key_update_1 //keyschedule according to fixslicing + sub.w r1, r1, #280 + bl key_update_1 //keyschedule according to fixslicing + sub.w r1, r1, #352 + movw r12, #0x5555 + movt r12, #0x5555 //r12<- 0x55555555 + mvn r11, r12 //r11<- 0xaaaaaaaa + bl key_update_2 //keyschedule according to fixslicing + sub.w r1, r1, #280 + bl key_update_2 //keyschedule according to fixslicing + sub.w r1, r1, #352 + orr r12, r8, r8, lsl #8 //r12<- 0x07070707 + movw r11, #0xc0c0 //r11<- 0x0000c0c0 + movw r10, #0x3030 //r10<- 0x00003030 + and r9, r12, r12, lsr #1 //r9 <- 0x03030303 + lsl r8, r12, #4 //r8 <- 0x70707070 + eor r7, r8, r9, lsl #5 //r7 <- 0x10101010 + movw r6, #0xf0f0 //r6 <- 0x0000f0f0 + bl key_update_3 //keyschedule according to fixslicing + sub.w r1, r1, #280 + bl key_update_3 //keyschedule 
according to fixslicing + sub.w r1, r1, #352 + movw r12, #0x0fff + lsl r10, r12, #16 + movw r8, #0x00ff //r8 <- 0x000000ff + movw r7, #0x03ff //r7 <- 0x000003ff + lsl r7, r7, #16 + bl key_update_4 //keyschedule according to fixslicing + sub.w r1, r1, #280 + bl key_update_4 //keyschedule according to fixslicing + pop {r1-r12,r14} + bx lr + +.align 2 +quintuple_round: + str.w r14, [sp] + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r11, r9 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + and r8, r4, r12, lsr #1 //permutation layer + and r12, r12, r2 + orr r12, r8, r12, lsl #3 //r12<- NIBBLE_ROR(r12, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //r11<- NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //r14 <- 0x33333333 + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //r10<- NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + mvn r14, r3, lsl #12 //r0 <- 0x0fff0fff + and r8, r14, r9, lsr #4 + and r9, r9, r3 + orr r9, r8, r9, lsl #12 //r9 <- HALF_ROR(r9, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //r11<- HALF_ROR(r11, 12) + rev16 r10, r10 //r10<- HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + orr r14, r2, r2, lsl #2 //r14 <- 0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r12, r12, lsr #1 + and r8, r8, r14, lsr #16 + eor r12, r12, r8 + eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x55550000, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x00005555, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r9, r9, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r11, r12, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #16 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r14, r3, r3, lsl #8 //r14 <- 0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //r10<- BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //r14 <- 0x3f3f3f3f for BYTE_ROR + mvn r8, r14 //r8 <- 0xc0c0c0c0 for BYTE_ROR + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //r11<- BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r9, lsr #6 + and r9, r14, r9 + orr r9, r8, r9, lsl #2 //r9 <- BYTE_ROR(r9, 6) + eor r10, r10, r6 //add 1st keyword + eor 
r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + ldr.w lr, [sp] //restore link register + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12, ror #24 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r9, r9, r5 //add rconst + eor r9, r9, r12 //swap r9 with r12 + eor r12, r12, r9 //swap r9 with r12 + eor r9, r9, r12 //swap r9 with r12 + bx lr + +/***************************************************************************** +* Balanced ARM assembly implementation of the GIFTb-128 block cipher. +* This function simply encrypts a 128-bit block, without any operation mode. +*****************************************************************************/ +.align 2 +@ void giftb128_encrypt_block(u8 *out, const u32* rkey, const u8 *block) +.global giftb128_encrypt_block +.type giftb128_encrypt_block,%function +giftb128_encrypt_block: + push {r0,r2-r12,r14} + sub.w sp, #4 //to store 'lr' when calling 'quintuple_round' + ldm r2, {r9-r12} // load plaintext words + rev r9, r9 + rev r10, r10 + rev r11, r11 + rev r12, r12 + movw r2, #0x1111 + movt r2, #0x1111 //r2 <- 0x11111111 (for NIBBLE_ROR) + movw r3, #0x000f + movt r3, #0x000f //r3 <- 0x000f000f (for HALF_ROR) + mvn r4, r2, lsl #3 //r4 <- 0x7777777 (for NIBBLE_ROR) + adr r0, rconst //r0 <- 'rconst' address + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + ldr.w r0, [sp ,#4] //restore 'ctext' address + rev r9, r9 + rev r10, r10 + rev r11, r11 + rev r12, r12 + stm r0, {r9-r12} + add.w sp, #4 + pop {r0,r2-r12,r14} + bx lr + \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/api.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/api.h new file mode 100644 index 0000000..fb1d58b --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/api.h @@ -0,0 +1,5 @@ +#define CRYPTO_KEYBYTES 16 +#define CRYPTO_NSECBYTES 0 +#define CRYPTO_NPUBBYTES 16 +#define CRYPTO_ABYTES 16 +#define CRYPTO_NOOVERLAP 1 diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/cofb.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/cofb.h new file mode 100644 index 0000000..143c7d3 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/cofb.h @@ -0,0 +1,20 @@ +#ifndef GIFT_COFB_H_ +#define GIFT_COFB_H_ + +#define TAG_SIZE 16 +#define COFB_ENCRYPT 1 +#define COFB_DECRYPT 0 + +#define XOR_BLOCK(x, y, z) ({ \ + (x)[0] = (y)[0] ^ (z)[0]; \ + (x)[1] = (y)[1] ^ (z)[1]; \ + (x)[2] = (y)[2] ^ (z)[2]; \ + (x)[3] = (y)[3] ^ (z)[3]; \ +}) + +#define XOR_TOP_BAR_BLOCK(x, y) ({ \ + (x)[0] ^= (y)[0]; \ + (x)[1] ^= (y)[1]; \ +}) + +#endif // GIFT_COFB_H_ \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/encrypt.c b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/encrypt.c new file mode 100644 index 0000000..6c65b58 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/encrypt.c @@ -0,0 +1,191 @@ +#include +#include "api.h" +#include "cofb.h" +#include 
"giftb128.h" + +static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){ + u32 i; + if (no_of_bytes == 0) { + d[0] = 0x00000080; // little-endian + d[1] = 0x00000000; + d[2] = 0x00000000; + d[3] = 0x00000000; + } + else if (no_of_bytes < GIFT128_BLOCK_SIZE) { + for (i = 0; i < no_of_bytes/4+1; i++) + d[i] = s[i]; + d[i-1] &= ~(0xffffffffL << (no_of_bytes % 4)*8); + d[i-1] |= 0x00000080L << (no_of_bytes % 4)*8; + for (; i < 4; i++) + d[i] = 0x00000000; + } + else { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + } +} + +static inline void double_half_block(u32* x) { + u32 tmp0; + tmp0 = (x)[0]; + (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); + (x)[0] |= ((x)[1] & 0x80808080) << 17; + (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); + (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; +} + +static inline void triple_half_block(u32* x) { + u32 tmp0, tmp1; + tmp0 = (x)[0]; + tmp1 = (x)[1]; + (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); + (x)[0] |= ((x)[1] & 0x80808080) << 17; + (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); + (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; + (x)[0] ^= tmp0; + (x)[1] ^= tmp1; +} + +static inline void g(u32 *x) { + u32 tmp0, tmp1; + tmp0 = (x)[0]; + tmp1 = (x)[1]; + (x)[0] = (x)[2]; + (x)[1] = (x)[3]; + (x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15); + (x)[2] |= ((tmp1 & 0x80808080) << 17); + (x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15); + (x)[3] |= ((tmp0 & 0x80808080) << 17); +} + +static inline void rho1(u32* d, u32* y, u32* m, u32 n) { + g(y); + padding(d,m,n); + XOR_BLOCK(d, d, y); +} + +static inline void rho(u32* y, u32* m, u32* x, u32* c, u32 n) { + XOR_BLOCK(c, y, m); + rho1(x, y, m, n); +} + +static inline void rho_prime(u32* y, u32*c, u32* x, u32* m, u32 n) { + XOR_BLOCK(m, y, c); + rho1(x, y, m, n); +} + +/**************************************************************************** +* Constant-time implementation of the GIFT-COFB authenticated cipher based on +* fixsliced GIFTb-128. Encryption/decryption is handled by the same function, +* depending on the 'mode' parameter (1/0). 
+****************************************************************************/ +int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad, + u32 ad_len, const u8* in, u32 in_len, const int encrypting) { + + u32 tmp0, tmp1, emptyA, emptyM, offset[2]; + u32 input[4], rkey[80]; + u8 Y[GIFT128_BLOCK_SIZE]; + + if (!encrypting) { + if (in_len < TAG_SIZE) + return -1; + in_len -= TAG_SIZE; + } + + if(ad_len == 0) + emptyA = 1; + else + emptyA = 0; + + if(in_len == 0) + emptyM =1; + else + emptyM = 0; + + gift128_keyschedule(key, rkey); + giftb128_encrypt_block(Y, rkey, nonce); + offset[0] = ((u32*)Y)[0]; + offset[1] = ((u32*)Y)[1]; + + while(ad_len > GIFT128_BLOCK_SIZE){ + rho1(input, (u32*)Y, (u32*)ad, GIFT128_BLOCK_SIZE); + double_half_block(offset); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + ad += GIFT128_BLOCK_SIZE; + ad_len -= GIFT128_BLOCK_SIZE; + } + + triple_half_block(offset); + if((ad_len % GIFT128_BLOCK_SIZE != 0) || (emptyA)) + triple_half_block(offset); + if(emptyM) { + triple_half_block(offset); + triple_half_block(offset); + } + + rho1(input, (u32*)Y, (u32*)ad, ad_len); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + + while (in_len > GIFT128_BLOCK_SIZE){ + double_half_block(offset); + if (encrypting) + rho((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE); + else + rho_prime((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + in += GIFT128_BLOCK_SIZE; + out += GIFT128_BLOCK_SIZE; + in_len -= GIFT128_BLOCK_SIZE; + } + + if(!emptyM){ + triple_half_block(offset); + if(in_len % GIFT128_BLOCK_SIZE != 0) + triple_half_block(offset); + if (encrypting) { + rho((u32*)Y, (u32*)in, input, (u32*)out, in_len); + out += in_len; + } + else { + rho_prime((u32*)Y, (u32*)in, input, (u32*)out, in_len); + in += in_len; + } + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + } + + if (encrypting) { // encryption mode + memcpy(out, Y, TAG_SIZE); + return 0; + } + // decrypting + tmp0 = 0; + for(tmp1 = 0; tmp1 < TAG_SIZE; tmp1++) + tmp0 |= in[tmp1] ^ Y[tmp1]; + return tmp0; +} + +int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen, + const unsigned char* m, unsigned long long mlen, + const unsigned char* ad, unsigned long long adlen, + const unsigned char* nsec, const unsigned char* npub, + const unsigned char* k) { + (void)nsec; + *clen = mlen + TAG_SIZE; + return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT); +} + +int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen, + unsigned char* nsec, const unsigned char* c, + unsigned long long clen, const unsigned char* ad, + unsigned long long adlen, const unsigned char* npub, + const unsigned char *k) { + (void)nsec; + *mlen = clen - TAG_SIZE; + return giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT); +} diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/giftb128.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/giftb128.h new file mode 100644 index 0000000..bcb4f36 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/giftb128.h @@ -0,0 +1,13 @@ +#ifndef GIFT128_H_ +#define GIFT128_H_ + +#define KEY_SIZE 16 +#define GIFT128_BLOCK_SIZE 16 + +typedef unsigned char u8; +typedef unsigned int u32; + +extern void gift128_keyschedule(const u8* key, u32* rkey); +extern void 
giftb128_encrypt_block(u8* out_block, const u32* rkey, const u8* in_block); + +#endif // GIFT128_H_ \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/giftb128.s b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/giftb128.s new file mode 100644 index 0000000..80e2ff8 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_compact/giftb128.s @@ -0,0 +1,525 @@ +/**************************************************************************** +* Compact ARM assembly implementation of the GIFT-128 block cipher. This +* implementation focuses on code size rather than speed. +* See "Fixslicing: A New GIFT Representation" paper available at +* https:// for more details. +****************************************************************************/ + +.syntax unified +.thumb + +/***************************************************************************** +* Round constants look-up table according to the fixsliced representation. +*****************************************************************************/ +.align 2 +.type rconst,%object +rconst: +.word 0x10000008, 0x80018000, 0x54000002, 0x01010181 +.word 0x8000001f, 0x10888880, 0x6001e000, 0x51500002 +.word 0x03030180, 0x8000002f, 0x10088880, 0x60016000 +.word 0x41500002, 0x03030080, 0x80000027, 0x10008880 +.word 0x4001e000, 0x11500002, 0x03020180, 0x8000002b +.word 0x10080880, 0x60014000, 0x01400002, 0x02020080 +.word 0x80000021, 0x10000080, 0x0001c000, 0x51000002 +.word 0x03010180, 0x8000002e, 0x10088800, 0x60012000 +.word 0x40500002, 0x01030080, 0x80000006, 0x10008808 +.word 0xc001a000, 0x14500002, 0x01020181, 0x8000001a + +.align 2 +key_update: + and r2, r10, r7, lsr #12 + and r3, r7, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r7, lsr #2 + orr r2, r2, r3 + and r7, r7, #0x00030000 + orr r7, r2, r7, lsl #14 + strd r5, r7, [r1], #8 //store rkeys after 1st key update + and r2, r10, r6, lsr #12 + and r3, r6, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r6, lsr #2 + orr r2, r2, r3 + and r6, r6, #0x00030000 + orr r6, r2, r6, lsl #14 + strd r4, r6, [r1], #8 //store rkeys after 2nd key update + and r2, r10, r5, lsr #12 + and r3, r5, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r5, lsr #2 + orr r2, r2, r3 + and r5, r5, #0x00030000 + orr r5, r2, r5, lsl #14 + strd r7, r5, [r1], #8 //store rkeys after 3rd key update + and r2, r10, r4, lsr #12 + and r3, r4, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r4, lsr #2 + orr r2, r2, r3 + and r4, r4, #0x00030000 + orr r4, r2, r4, lsl #14 + strd r6, r4, [r1], #8 //store rkeys after 4th key update + bx lr + +.align 2 +rearrange_rkey_0: + ldrd r6, r4, [r1] + eor r12, r6, r6, lsr #9 + and r12, r12, r3 + eor r6, r12 + eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9); + eor r12, r4, r4, lsr #9 + and r12, r12, r3 + eor r4, r12 + eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9); + eor r12, r6, r6, lsr #18 + and r12, r12, r10 + eor r6, r12 + eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18); + eor r12, r4, r4, lsr #18 + and r12, r12, r10 + eor r4, r12 + eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18); + eor r12, r6, r6, lsr #12 + and r12, r12, r11 + eor r6, r12 + eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12); + eor r12, r4, r4, lsr #12 + and r12, r12, r11 + eor r4, r12 + eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12); + eor r12, r6, r6, lsr #24 + and r12, r12, #0xff + eor r6, r12 + eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24); + eor r12, r4, r4, 
lsr #24 + and r12, r12, #0xff + eor r4, r12 + eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24); + strd r6, r4, [r1] + bx lr + +.align 2 +rearrange_rkey_1: + ldrd r5, r7, [r1] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6); + eor r8, r5, r5, lsr #6 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + strd r5, r7, [r1] + bx lr + +.align 2 +rearrange_rkey_2: + ldrd r5, r7, [r1] + eor r8, r7, r7, lsr #15 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15); + eor r8, r5, r5, lsr #15 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15); + eor r8, r7, r7, lsr #18 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18); + eor r8, r5, r5, lsr #18 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x00000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + strd r5, r7, [r1] + bx lr + +.align 2 +rearrange_rkey_3: + ldrd r5, r7, [r1] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6); + eor r8, r5, r5, lsr #6 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + strd r5, r7, [r1] + bx lr + +/***************************************************************************** +* Code size optimized implementation of the GIFTb-128 key schedule. +* Compute the key schedule in the normal representation and then rearrange all +* the round keys in their respective fixsliced representations. 
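+*
+* For reference, one 'key_update' step rotates, within each packed 32-bit key
+* word, the low 16-bit half right by 12 and the high 16-bit half right by 2.
+* A C sketch of the same transform (illustrative only, not part of the build):
+*
+*   u32 key_update_word(u32 w) {
+*       u32 lo = ((w >> 12) & 0x000f) | ((w & 0x0fff) << 4);         // low half >>> 12
+*       u32 hi = ((w >> 2) & 0x3fff0000) | ((w & 0x00030000) << 14); // high half >>> 2
+*       return hi | lo;
+*   }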
+*****************************************************************************/ +.align 2 +@ void gift128_keyschedule(const u8* key, u32* rkey) +.global gift128_keyschedule +.type gift128_keyschedule,%function +gift128_keyschedule: + push {r1-r12, r14} + ldm r0, {r4-r7} //load key words + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + strd r7, r5, [r1], #8 //the first rkeys are not updated + strd r6, r4, [r1], #8 //the first rkeys are not updated + // keyschedule using classical representation for the first 20 rounds + movw r12, #0x3fff + lsl r12, r12, #16 //r12<- 0x3fff0000 + movw r10, #0x000f //r10<- 0x0000000f + movw r9, #0x0fff //r9 <- 0x00000fff + bl key_update + bl key_update + bl key_update + bl key_update + bl key_update + bl key_update + bl key_update + bl key_update + bl key_update + and r2, r10, r7, lsr #12 + and r3, r7, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r7, lsr #2 + orr r2, r2, r3 + and r7, r7, #0x00030000 + orr r7, r2, r7, lsl #14 + strd r5, r7, [r1], #8 //penultimate key update + and r2, r10, r6, lsr #12 + and r3, r6, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r6, lsr #2 + orr r2, r2, r3 + and r6, r6, #0x00030000 + orr r6, r2, r6, lsl #14 + strd r4, r6, [r1], #8 //ultimate key update + sub.w r1, r1, #320 + // rearrange the rkeys to their respective new representations + movw r3, #0x0055 + movt r3, #0x0055 //r3 <- 0x00550055 + movw r10, #0x3333 //r10<- 0x00003333 + movw r11, #0x000f + movt r11, #0x000f //r11<- 0x000f000f + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + add.w r1, r1, #40 + bl rearrange_rkey_0 + sub.w r1, r1, #272 + movw r3, #0x1111 + movt r3, #0x1111 //r3 <- 0x11111111 + movw r10, #0x0303 + movt r10, #0x0303 //r10<- 0x03030303 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + add.w r1, r1, #40 + bl rearrange_rkey_1 + sub.w r1, r1, #272 + movw r3, #0xaaaa //r3 <- 0x0000aaaa + movw r10, #0x3333 //r10<- 0x00003333 + movw r11, #0xf0f0 //r11<- 0x0000f0f0 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + add.w r1, r1, #40 + bl rearrange_rkey_2 + sub.w r1, r1, #272 + movw r3, #0x0a0a + movt r3, #0x0a0a //r3 <- 0x0a0a0a0a + movw r10, #0x00cc + movt r10, #0x00cc //r10<- 0x00cc00cc + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + add.w r1, r1, #40 + bl rearrange_rkey_3 + pop {r1-r12, r14} + bx lr + +.align 2 +quintuple_round: + str.w r14, [sp] + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r11, r9 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + and r8, r4, r12, lsr #1 
//permutation layer + and r12, r12, r2 + orr r12, r8, r12, lsl #3 //r12<- NIBBLE_ROR(r12, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //r11<- NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //r14 <- 0x33333333 + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //r10<- NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + mvn r14, r3, lsl #12 //r14 <- 0x0fff0fff + and r8, r14, r9, lsr #4 + and r9, r9, r3 + orr r9, r8, r9, lsl #12 //r9 <- HALF_ROR(r9, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //r11<- HALF_ROR(r11, 12) + rev16 r10, r10 //r10<- HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + orr r14, r2, r2, lsl #2 //r14 <- 0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r12, r12, lsr #1 + and r8, r8, r14, lsr #16 + eor r12, r12, r8 + eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x55550000, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x00005555, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r9, r9, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + and r8, r11, r12, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #16 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r14, r3, r3, lsl #8 //r14 <- 0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //r10<- BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //r14 <- 0x3f3f3f3f for BYTE_ROR + mvn r8, r14 //r8 <- 0xc0c0c0c0 for BYTE_ROR + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //r11<- BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r9, lsr #6 + and r9, r14, r9 + orr r9, r8, r9, lsl #2 //r9 <- BYTE_ROR(r9, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + ldr.w r5, [r0], #4 + ldr.w r6, [r1], #4 //load rkey + ldr.w r7, [r1], #4 //load rkey + ldr.w lr, [sp] //restore link register + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12, ror #24 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r9, r9, r5 //add rconst + eor r9, r9, r12 //swap r9 with r12 + eor r12, r12, r9 //swap r9 with r12 + eor r9, r9, r12 //swap r9 with r12 + bx lr + 
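+/*****************************************************************************
+* Reference sketch (illustrative only, not part of the build): each call to
+* 'quintuple_round' above consumes five round constants (r0 advances by 20
+* bytes) and five round-key pairs (r1 advances by 40 bytes), so the 40 rounds
+* of GIFTb-128 amount to eight successive calls, roughly:
+*
+*   for (int i = 0; i < 8; i++)
+*       quintuple_round(&rconst[5*i], &rkey[10*i]);   // state kept in r9-r12
+*
+* The link register is spilled to [sp] on entry because r14 is reused as a
+* mask register inside the rounds; it is reloaded before the final 'bx lr'.
+*****************************************************************************/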
+/***************************************************************************** +* Code size optimized implementation of the GIFTb-128 block cipher. +* This function simply encrypts a 128-bit block, without any operation mode. +*****************************************************************************/ +.align 2 +@ void giftb128_encrypt_block(u8 *out, const u32* rkey, const u8 *block) +.global giftb128_encrypt_block +.type giftb128_encrypt_block,%function +giftb128_encrypt_block: + push {r0,r2-r12,r14} + sub.w sp, #4 //to store 'lr' when calling 'quintuple_round' + ldm r2, {r9-r12} // load plaintext words + rev r9, r9 + rev r10, r10 + rev r11, r11 + rev r12, r12 + movw r2, #0x1111 + movt r2, #0x1111 //r2 <- 0x11111111 (for NIBBLE_ROR) + movw r3, #0x000f + movt r3, #0x000f //r3 <- 0x000f000f (for HALF_ROR) + mvn r4, r2, lsl #3 //r4 <- 0x77777777 (for NIBBLE_ROR) + adr r0, rconst //r0 <- 'rconst' address + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + bl quintuple_round + ldr.w r0, [sp, #4] //restore 'ctext' address + rev r9, r9 + rev r10, r10 + rev r11, r11 + rev r12, r12 + stm r0, {r9-r12} + add.w sp, #4 + pop {r0,r2-r12,r14} + bx lr + \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/api.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/api.h new file mode 100644 index 0000000..fb1d58b --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/api.h @@ -0,0 +1,5 @@ +#define CRYPTO_KEYBYTES 16 +#define CRYPTO_NSECBYTES 0 +#define CRYPTO_NPUBBYTES 16 +#define CRYPTO_ABYTES 16 +#define CRYPTO_NOOVERLAP 1 diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/cofb.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/cofb.h new file mode 100644 index 0000000..c580057 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/cofb.h @@ -0,0 +1,66 @@ +#ifndef GIFT_COFB_H_ +#define GIFT_COFB_H_ + +#define TAG_SIZE 16 +#define COFB_ENCRYPT 1 +#define COFB_DECRYPT 0 + +#define DOUBLE_HALF_BLOCK(x) ({ \ + tmp0 = (x)[0]; \ + (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \ + (x)[0] |= ((x)[1] & 0x80808080) << 17; \ + (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \ + (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \ +}) + +#define TRIPLE_HALF_BLOCK(x) ({ \ + tmp0 = (x)[0]; \ + tmp1 = (x)[1]; \ + (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \ + (x)[0] |= ((x)[1] & 0x80808080) << 17; \ + (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \ + (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \ + (x)[0] ^= tmp0; \ + (x)[1] ^= tmp1; \ +}) + +#define G(x) ({ \ + tmp0 = (x)[0]; \ + tmp1 = (x)[1]; \ + (x)[0] = (x)[2]; \ + (x)[1] = (x)[3]; \ + (x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15); \ + (x)[2] |= ((tmp1 & 0x80808080) << 17); \ + (x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15); \ + (x)[3] |= ((tmp0 & 0x80808080) << 17); \ +}) + +#define XOR_BLOCK(x, y, z) ({ \ + (x)[0] = (y)[0] ^ (z)[0]; \ + (x)[1] = (y)[1] ^ (z)[1]; \ + (x)[2] = (y)[2] ^ (z)[2]; \ + (x)[3] = (y)[3] ^ (z)[3]; \ +}) + +#define XOR_TOP_BAR_BLOCK(x, y) ({ \ + (x)[0] ^= (y)[0]; \ + (x)[1] ^= (y)[1]; \ +}) + +#define RHO1(d, y, m, n) ({ \ + G(y); \ + padding(d,m,n); \ + XOR_BLOCK(d, d, y); \ +}) + +#define RHO(y, m, x, c, n) ({ \ + XOR_BLOCK(c,
y, m); \ + RHO1(x, y, m, n); \ +}) + +#define RHO_PRIME(y, c, x, m, n) ({ \ + XOR_BLOCK(m, y, c); \ + RHO1(x, y, m, n); \ +}) + +#endif // GIFT_COFB_H_ \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/encrypt.c b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/encrypt.c new file mode 100644 index 0000000..8eed961 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/encrypt.c @@ -0,0 +1,141 @@ +#include <string.h> +#include "cofb.h" +#include "giftb128.h" + +static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){ + u32 i; + if (no_of_bytes == 0) { + d[0] = 0x00000080; // little-endian + d[1] = 0x00000000; + d[2] = 0x00000000; + d[3] = 0x00000000; + } + else if (no_of_bytes < GIFT128_BLOCK_SIZE) { + for (i = 0; i < no_of_bytes/4+1; i++) + d[i] = s[i]; + d[i-1] &= ~(0xffffffffL << (no_of_bytes % 4)*8); + d[i-1] |= 0x00000080L << (no_of_bytes % 4)*8; + for (; i < 4; i++) + d[i] = 0x00000000; + } + else { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + } +} + +/**************************************************************************** +* Constant-time implementation of the GIFT-COFB authenticated cipher based on +* fixsliced GIFTb-128. Encryption/decryption is handled by the same function, +* depending on the 'encrypting' parameter (1/0). + ***************************************************************************/ +int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad, + u32 ad_len, const u8* in, u32 in_len, const int encrypting) { + + u32 tmp0, tmp1, emptyA, emptyM, offset[2]; + u32 input[4], rkey[80]; + u8 Y[GIFT128_BLOCK_SIZE]; + + if (!encrypting) { + if (in_len < TAG_SIZE) + return -1; + in_len -= TAG_SIZE; + } + + if(ad_len == 0) + emptyA = 1; + else + emptyA = 0; + + if(in_len == 0) + emptyM = 1; + else + emptyM = 0; + + gift128_keyschedule(key, rkey); + giftb128_encrypt_block(Y, rkey, nonce); + offset[0] = ((u32*)Y)[0]; + offset[1] = ((u32*)Y)[1]; + + while(ad_len > GIFT128_BLOCK_SIZE){ + RHO1(input, (u32*)Y, (u32*)ad, GIFT128_BLOCK_SIZE); + DOUBLE_HALF_BLOCK(offset); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + ad += GIFT128_BLOCK_SIZE; + ad_len -= GIFT128_BLOCK_SIZE; + } + + TRIPLE_HALF_BLOCK(offset); + if((ad_len % GIFT128_BLOCK_SIZE != 0) || (emptyA)) + TRIPLE_HALF_BLOCK(offset); + if(emptyM) { + TRIPLE_HALF_BLOCK(offset); + TRIPLE_HALF_BLOCK(offset); + } + + RHO1(input, (u32*)Y, (u32*)ad, ad_len); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + + while (in_len > GIFT128_BLOCK_SIZE){ + DOUBLE_HALF_BLOCK(offset); + if (encrypting) + RHO((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE); + else + RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, GIFT128_BLOCK_SIZE); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + in += GIFT128_BLOCK_SIZE; + out += GIFT128_BLOCK_SIZE; + in_len -= GIFT128_BLOCK_SIZE; + } + + if(!emptyM){ + TRIPLE_HALF_BLOCK(offset); + if(in_len % GIFT128_BLOCK_SIZE != 0) + TRIPLE_HALF_BLOCK(offset); + if (encrypting) { + RHO((u32*)Y, (u32*)in, input, (u32*)out, in_len); + out += in_len; + } + else { + RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, in_len); + in += in_len; + } + XOR_TOP_BAR_BLOCK(input, offset); + giftb128_encrypt_block(Y, rkey, (u8*)input); + } + + if (encrypting) { // encryption mode + memcpy(out, Y, TAG_SIZE); + return 0; + } + // decrypting + tmp0 = 0; + for(tmp1 = 0; tmp1 < TAG_SIZE;
tmp1++) + tmp0 |= in[tmp1] ^ Y[tmp1]; + return tmp0; +} + +int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen, + const unsigned char* m, unsigned long long mlen, + const unsigned char* ad, unsigned long long adlen, + const unsigned char* nsec, const unsigned char* npub, + const unsigned char* k) { + (void)nsec; + *clen = mlen + TAG_SIZE; + return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT); +} + +int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen, + unsigned char* nsec, const unsigned char* c, + unsigned long long clen, const unsigned char* ad, + unsigned long long adlen, const unsigned char* npub, + const unsigned char *k) { + (void)nsec; + *mlen = clen - TAG_SIZE; + return giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT); +} diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/giftb128.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/giftb128.h new file mode 100644 index 0000000..8c904b6 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/giftb128.h @@ -0,0 +1,13 @@ +#ifndef GIFT128_H_ +#define GIFT128_H_ + +#define KEY_SIZE 16 +#define GIFT128_BLOCK_SIZE 16 + +typedef unsigned char u8; +typedef unsigned int u32; + +extern void gift128_keyschedule(const u8* key, u32* rkey); +extern void giftb128_encrypt_block(u8* out_block, const u32* rkey, const u8* in_block); + +#endif // GIFT128_H_ \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/giftb128.s b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/giftb128.s new file mode 100644 index 0000000..0d5d8e0 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/armcortexm_fast/giftb128.s @@ -0,0 +1,2044 @@ +/**************************************************************************** +* Fully unrolled ARM assembly implementation of the GIFTb-128 block cipher. +* This implementation focuses on speed, at the cost of a large code size. +* See "Fixslicing: A New GIFT Representation" paper available at +* https:// for more details. +* +* @author Alexandre Adomnicai, Nanyang Technological University, +* alexandre.adomnicai@ntu.edu.sg +* @date March 2020 +****************************************************************************/ + +.syntax unified +.thumb +/***************************************************************************** +* Fully unrolled implementation of the GIFT-128 key schedule according to the +* fixsliced representation.
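+*
+* SWAPMOVE(a, b, mask, n), the bit-permutation primitive annotated throughout
+* the code below, is in C terms (reference sketch only):
+*
+*   tmp = (b ^ (a >> n)) & mask;
+*   b ^= tmp;
+*   a ^= tmp << n;
+*
+* Each eor/and/eor/eor quadruple below implements one such SWAPMOVE, swapping
+* the bits of 'b' selected by 'mask' with the bits of 'a' selected by
+* 'mask << n'.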
+*****************************************************************************/ +@ void gift128_keyschedule(const u8* key, u32* rkey) +.global gift128_keyschedule +.type gift128_keyschedule,%function +gift128_keyschedule: + push {r2-r12, r14} + ldm r0, {r4-r7} //load key words + rev r4, r4 + rev r5, r5 + rev r6, r6 + rev r7, r7 + str.w r6, [r1, #8] + str.w r4, [r1, #12] + str.w r7, [r1] + str.w r5, [r1, #4] + // keyschedule using classical representation for the first 20 rounds + movw r12, #0x3fff + lsl r12, r12, #16 //r12<- 0x3fff0000 + movw r10, #0x000f //r10<- 0x0000000f + movw r9, #0x0fff //r9 <- 0x00000fff + // 1st classical key update + and r2, r10, r7, lsr #12 + and r3, r7, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r7, lsr #2 + orr r2, r2, r3 + and r7, r7, #0x00030000 + orr r7, r2, r7, lsl #14 + str.w r5, [r1, #16] + str.w r7, [r1, #20] + // 2nd classical key update + and r2, r10, r6, lsr #12 + and r3, r6, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r6, lsr #2 + orr r2, r2, r3 + and r6, r6, #0x00030000 + orr r6, r2, r6, lsl #14 + str.w r4, [r1, #24] + str.w r6, [r1, #28] + // 3rd classical key update + and r2, r10, r5, lsr #12 + and r3, r5, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r5, lsr #2 + orr r2, r2, r3 + and r5, r5, #0x00030000 + orr r5, r2, r5, lsl #14 + str.w r7, [r1, #32] + str.w r5, [r1, #36] + // 4th classical key update + and r2, r10, r4, lsr #12 + and r3, r4, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r4, lsr #2 + orr r2, r2, r3 + and r4, r4, #0x00030000 + orr r4, r2, r4, lsl #14 + str.w r6, [r1, #40] + str.w r4, [r1, #44] + // 5th classical key update + and r2, r10, r7, lsr #12 + and r3, r7, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r7, lsr #2 + orr r2, r2, r3 + and r7, r7, #0x00030000 + orr r7, r2, r7, lsl #14 + str.w r5, [r1, #48] + str.w r7, [r1, #52] + // 6th classical key update + and r2, r10, r6, lsr #12 + and r3, r6, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r6, lsr #2 + orr r2, r2, r3 + and r6, r6, #0x00030000 + orr r6, r2, r6, lsl #14 + str.w r4, [r1, #56] + str.w r6, [r1, #60] + // 7th classical key update + and r2, r10, r5, lsr #12 + and r3, r5, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r5, lsr #2 + orr r2, r2, r3 + and r5, r5, #0x00030000 + orr r5, r2, r5, lsl #14 + str.w r7, [r1, #64] + str.w r5, [r1, #68] + // 8th classical key update + and r2, r10, r4, lsr #12 + and r3, r4, r9 + orr r2, r2, r3, lsl #4 + and r3, r12, r4, lsr #2 + orr r2, r2, r3 + and r4, r4, #0x00030000 + orr r4, r2, r4, lsl #14 + str.w r6, [r1, #72] + str.w r4, [r1, #76] + // rearrange the rkeys to their respective new representations + // REARRANGE_RKEY_0 + movw r3, #0x0055 + movt r3, #0x0055 //r3 <- 0x00550055 + movw r10, #0x3333 //r10<- 0x00003333 + movw r11, #0x000f + movt r11, #0x000f //r11<- 0x000f000f + ldrd r6, r4, [r1] + eor r12, r6, r6, lsr #9 + and r12, r12, r3 + eor r6, r12 + eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9); + eor r12, r4, r4, lsr #9 + and r12, r12, r3 + eor r4, r12 + eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9); + eor r12, r6, r6, lsr #18 + and r12, r12, r10 + eor r6, r12 + eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18); + eor r12, r4, r4, lsr #18 + and r12, r12, r10 + eor r4, r12 + eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18); + eor r12, r6, r6, lsr #12 + and r12, r12, r11 + eor r6, r12 + eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12); + eor r12, r4, r4, lsr #12 + and r12, r12, r11 + eor r4, r12 + eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12); + eor r12, r6, r6, lsr #24 + and r12, 
r12, #0xff + eor r6, r12 + eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24); + eor r12, r4, r4, lsr #24 + and r12, r12, #0xff + eor r4, r12 + eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24); + strd r6, r4, [r1] + ldrd r6, r4, [r1, #40] + eor r12, r6, r6, lsr #9 + and r12, r12, r3 + eor r6, r12 + eor r6, r6, r12, lsl #9 //SWAPMOVE(r6, r6, 0x00550055, 9); + eor r12, r4, r4, lsr #9 + and r12, r12, r3 + eor r4, r12 + eor r4, r4, r12, lsl #9 //SWAPMOVE(r4, r4, 0x00550055, 9); + eor r12, r6, r6, lsr #18 + and r12, r12, r10 + eor r6, r12 + eor r6, r6, r12, lsl #18 //SWAPMOVE(r6, r6, 0x3333, 18); + eor r12, r4, r4, lsr #18 + and r12, r12, r10 + eor r4, r12 + eor r4, r4, r12, lsl #18 //SWAPMOVE(r4, r4, 0x3333, 18); + eor r12, r6, r6, lsr #12 + and r12, r12, r11 + eor r6, r12 + eor r6, r6, r12, lsl #12 //SWAPMOVE(r6, r6, 0x000f000f, 12); + eor r12, r4, r4, lsr #12 + and r12, r12, r11 + eor r4, r12 + eor r4, r4, r12, lsl #12 //SWAPMOVE(r4, r4, 0x000f000f, 12); + eor r12, r6, r6, lsr #24 + and r12, r12, #0xff + eor r6, r12 + eor r6, r6, r12, lsl #24 //SWAPMOVE(r6, r6, 0x000000ff, 24); + eor r12, r4, r4, lsr #24 + and r12, r12, #0xff + eor r4, r12 + eor r4, r4, r12, lsl #24 //SWAPMOVE(r4, r4, 0x000000ff, 24); + str.w r6, [r1, #40] + str.w r4, [r1, #44] + // REARRANGE_RKEY_1 + movw r3, #0x1111 + movt r3, #0x1111 + movw r10, #0x0303 + movt r10, #0x0303 + ldrd r5, r7, [r1, #8] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6); + eor r8, r5, r5, lsr #6 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + ldr.w r12, [r1, #48] + ldr.w r14, [r1, #52] + str.w r5, [r1, #8] + str.w r7, [r1, #12] + eor r8, r14, r14, lsr #3 + and r8, r8, r3 + eor r14, r8 + eor r14, r14, r8, lsl #3 //SWAPMOVE(r7, r7, 0x11111111, 3); + eor r8, r12, r12, lsr #3 + and r8, r8, r3 + eor r12, r8 + eor r12, r12, r8, lsl #3 //SWAPMOVE(r5, r5, 0x11111111, 3); + eor r8, r14, r14, lsr #6 + and r8, r8, r10 + eor r14, r8 + eor r14, r14, r8, lsl #6 //SWAPMOVE(r7, r7, 0x03030303, 6); + eor r8, r12, r12, lsr #6 + and r8, r8, r10 + eor r12, r8 + eor r12, r12, r8, lsl #6 //SWAPMOVE(r5, r5, 0x03030303, 6); + eor r8, r14, r14, lsr #12 + and r8, r8, r11 + eor r14, r8 + eor r14, r14, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r12, r12, lsr #12 + and r8, r8, r11 + eor r12, r8 + eor r12, r12, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r14, r14, lsr #24 + and r8, r8, #0xff + eor r14, r8 + eor r14, r14, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r12, r12, lsr #24 + and r8, r8, #0xff + eor r12, r8 + eor r12, r12, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + str.w r12, [r1, #48] + str.w r14, [r1, #52] + // REARRANGE_RKEY_2 + movw r3, #0xaaaa + movw r10, #0x3333 + movw r11, #0xf0f0 + ldrd 
r5, r7, [r1, #16] + eor r8, r7, r7, lsr #15 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15); + eor r8, r5, r5, lsr #15 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15); + eor r8, r7, r7, lsr #18 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18); + eor r8, r5, r5, lsr #18 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x00000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + strd r5, r7, [r1, #16] + ldrd r5, r7, [r1, #56] + eor r8, r7, r7, lsr #15 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #15 //SWAPMOVE(r7, r7, 0x0000aaaa, 15); + eor r8, r5, r5, lsr #15 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #15 //SWAPMOVE(r5, r5, 0x0000aaaa, 15); + eor r8, r7, r7, lsr #18 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #18 //SWAPMOVE(r7, r7, 0x00003333, 18); + eor r8, r5, r5, lsr #18 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #18 //SWAPMOVE(r5, r5, 0x00003333, 18); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + str.w r5, [r1, #56] + str.w r7, [r1, #60] + // REARRANGE_RKEY_3 + movw r3, #0x0a0a + movt r3, #0x0a0a //r3 <- 0x0a0a0a0a + movw r10, #0x00cc + movt r10, #0x00cc //r10<- 0x00cc00cc + ldrd r5, r7, [r1, #24] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6); + eor r8, r5, r5, lsr #6 + and r8, r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x000000ff, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x000000ff, 24); + strd r5, r7, [r1, #24] + ldrd r5, r7, [r1, #64] + eor r8, r7, r7, lsr #3 + and r8, r8, r3 + eor r7, r8 + eor r7, r7, r8, lsl #3 //SWAPMOVE(r7, r7, 0x0a0a0a0a, 3); + eor r8, r5, r5, lsr #3 + and r8, r8, r3 + eor r5, r8 + eor r5, r5, r8, lsl #3 //SWAPMOVE(r5, r5, 0x0a0a0a0a, 3); + eor r8, r7, r7, lsr #6 + and r8, r8, r10 + eor r7, r8 + eor r7, r7, r8, lsl #6 //SWAPMOVE(r7, r7, 0x00cc00cc, 6); + eor r8, r5, r5, lsr #6 + and r8, 
r8, r10 + eor r5, r8 + eor r5, r5, r8, lsl #6 //SWAPMOVE(r5, r5, 0x00cc00cc, 6); + eor r8, r7, r7, lsr #12 + and r8, r8, r11 + eor r7, r8 + eor r7, r7, r8, lsl #12 //SWAPMOVE(r7, r7, 0x000f000f, 12); + eor r8, r5, r5, lsr #12 + and r8, r8, r11 + eor r5, r8 + eor r5, r5, r8, lsl #12 //SWAPMOVE(r5, r5, 0x000f000f, 12); + eor r8, r7, r7, lsr #24 + and r8, r8, #0xff + eor r7, r8 + eor r7, r7, r8, lsl #24 //SWAPMOVE(r7, r7, 0x0000ff00, 24); + eor r8, r5, r5, lsr #24 + and r8, r8, #0xff + eor r5, r8 + eor r5, r5, r8, lsl #24 //SWAPMOVE(r5, r5, 0x0000ff00, 24); + str.w r5, [r1, #64] + str.w r7, [r1, #68] + //keyschedule according to the new representations + // KEY_DOULBE/TRIPLE_UPDATE_0 + movw r10, #0x3333 + eor r12, r10, r10, lsl #16 + mvn r11, r12 + movw r9, #0x4444 + movt r9, #0x5555 + movw r8, #0x1100 + movt r8, #0x5555 + ldrd r4, r5, [r1] + and r2, r12, r4, ror #24 + and r4, r4, r11 + orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4) + eor r2, r4, r4, lsr #1 + and r2, r2, r8 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1) + eor r2, r5, r5, lsr #16 + and r2, r2, r10 + eor r5, r5, r2 + eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16) + eor r2, r5, r5, lsr #1 + and r2, r2, r9 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1) + str.w r5, [r1, #80] + str.w r4, [r1, #84] + and r2, r12, r5, ror #24 + and r5, r5, r11 + orr r5, r2, r5, ror #16 //KEY_TRIPLE_UPDATE_1(r5) + eor r2, r5, r5, lsr #1 + and r2, r2, r8 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x55551100, 1) + eor r2, r4, r4, lsr #16 + and r2, r2, r10 + eor r4, r4, r2 + eor r4, r4, r2, lsl #16 //SWAPMOVE(r4, r4, 0x00003333, 16) + eor r2, r4, r4, lsr #1 + and r2, r2, r9 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x555544444, 1) + str.w r4, [r1, #160] + str.w r5, [r1, #164] + and r2, r12, r4, ror #24 + and r4, r4, r11 + orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4) + eor r2, r4, r4, lsr #1 + and r2, r2, r8 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1) + eor r2, r5, r5, lsr #16 + and r2, r2, r10 + eor r5, r5, r2 + eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16) + eor r2, r5, r5, lsr #1 + and r2, r2, r9 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1) + strd r5, r4, [r1, #240] + ldrd r4, r5, [r1, #40] + and r2, r12, r4, ror #24 + and r4, r4, r11 + orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4) + eor r2, r4, r4, lsr #1 + and r2, r2, r8 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x55551100, 1) + eor r2, r5, r5, lsr #16 + and r2, r2, r10 + eor r5, r5, r2 + eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16) + eor r2, r5, r5, lsr #1 + and r2, r2, r9 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1) + str.w r5, [r1, #120] + str.w r4, [r1, #124] + and r2, r12, r5, ror #24 + and r5, r5, r11 + orr r5, r2, r5, ror #16 //KEY_TRIPLE_UPDATE_1(r5) + eor r2, r5, r5, lsr #1 + and r2, r2, r8 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x55551100, 1) + eor r2, r4, r4, lsr #16 + and r2, r2, r10 + eor r4, r4, r2 + eor r4, r4, r2, lsl #16 //SWAPMOVE(r4, r4, 0x00003333, 16) + eor r2, r4, r4, lsr #1 + and r2, r2, r9 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 //SWAPMOVE(r4, r4, 0x555544444, 1) + str.w r4, [r1, #200] + str.w r5, [r1, #204] + and r2, r12, r4, ror #24 + and r4, r4, r11 + orr r4, r2, r4, ror #16 //KEY_TRIPLE_UPDATE_1(r4) + eor r2, r4, r4, lsr #1 + and r2, r2, r8 + eor r4, r4, r2 + eor r4, r4, r2, lsl #1 
//SWAPMOVE(r4, r4, 0x55551100, 1) + eor r2, r5, r5, lsr #16 + and r2, r2, r10 + eor r5, r5, r2 + eor r5, r5, r2, lsl #16 //SWAPMOVE(r5, r5, 0x00003333, 16) + eor r2, r5, r5, lsr #1 + and r2, r2, r9 + eor r5, r5, r2 + eor r5, r5, r2, lsl #1 //SWAPMOVE(r5, r5, 0x555544444, 1) + str.w r5, [r1, #280] + str.w r4, [r1, #284] + // KEY_DOULBE/TRIPLE_UPDATE_2 + // masks + movw r12, #0x0f00 + movt r12, #0x0f00 + movw r11, #0x0003 + movt r11, #0x0003 + movw r10, #0x003f + movt r10, #0x003f + lsl r9, r11, #8 //r9 <- 0x03000300 + and r8, r10, r10, lsr #3 //r8 <- 0x00070007 + orr r7, r8, r8, lsl #2 //r7 <- 0x001f001f + ldrd r4, r5, [r1, #8] + and r2, r9, r4, lsr #6 + and r3, r4, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #5 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r12, r5, lsr #4 + and r3, r5, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r5, lsr #6 + orr r2, r2, r3 + and r5, r5, r10 + orr r5, r2, r5, lsl #2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r5, [r1, #88] + str.w r4, [r1, #92] + and r2, r9, r5, lsr #6 + and r3, r5, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #5 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_TRIPLE_UPDATE_2(r5) + and r2, r12, r4, lsr #4 + and r3, r4, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r4, lsr #6 + orr r2, r2, r3 + and r4, r4, r10 + orr r4, r2, r4, lsl #2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r4, [r1, #168] + str.w r5, [r1, #172] + and r2, r9, r4, lsr #6 + and r3, r4, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #5 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r12, r5, lsr #4 + and r3, r5, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r5, lsr #6 + orr r2, r2, r3 + and r5, r5, r10 + orr r5, r2, r5, lsl#2 //KEY_DOUBLE_UPDATE_2(r5) + strd r5, r4, [r1, #248] + ldrd r4, r5, [r1, #48] + and r2, r9, r4, lsr #6 + and r3, r4, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #5 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r12, r5, lsr #4 + and r3, r5, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r5, lsr #6 + orr r2, r2, r3 + and r5, r5, r10 + orr r5, r2, r5, lsl #2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r5, [r1, #128] + str.w r4, [r1, #132] + and r2, r9, r5, lsr #6 + and r3, r5, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #5 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_TRIPLE_UPDATE_2(r5) + and r2, r12, r4, lsr #4 + and r3, r4, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r4, lsr #6 + orr r2, r2, r3 + and r4, r4, r10 + orr r4, r2, r4, lsl #2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r4, [r1, #208] + str.w r5, [r1, #212] + and r2, r9, r4, lsr #6 + and r3, r4, r10, lsl #8 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #5 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r12, r5, lsr #4 + and r3, r5, r12 + orr r2, r2, r3, lsl #4 + and r3, r11, r5, lsr #6 + orr r2, r2, r3 + and r5, r5, r10 + orr r5, r2, r5, lsl#2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r5, [r1, #288] + str.w r4, [r1, #292] + // KEY_DOULBE/TRIPLE_UPDATE_2 + // masks + movw r12, #0x5555 + movt r12, #0x5555 + mvn r11, r12 + ldrd r4, r5, [r1, #16] + and r2, r12, r4, ror #24 + and r4, r11, r4, ror #20 + orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r11, r5, ror #24 + and r5, r12, r5, ror #16 + orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r5, [r1, #96] + str.w r4, [r1, #100] + and r2, r12, r5, ror #24 + and r5, r11, r5, ror #20 + orr r5, r5, r2 
//KEY_TRIPLE_UPDATE_2(r5) + and r2, r11, r4, ror #24 + and r4, r12, r4, ror #16 + orr r4, r4, r2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r4, [r1, #176] + str.w r5, [r1, #180] + and r2, r12, r4, ror #24 + and r4, r11, r4, ror #20 + orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r11, r5, ror #24 + and r5, r12, r5, ror #16 + orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r5) + strd r5, r4, [r1, #256] + ldrd r4, r5, [r1, #56] + and r2, r12, r4, ror #24 + and r4, r11, r4, ror #20 + orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r5) + and r2, r11, r5, ror #24 + and r5, r12, r5, ror #16 + orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r5, [r1, #136] + str.w r4, [r1, #140] + and r2, r12, r5, ror #24 + and r5, r11, r5, ror #20 + orr r5, r5, r2 //KEY_TRIPLE_UPDATE_2(r4) + and r2, r11, r4, ror #24 + and r4, r12, r4, ror #16 + orr r4, r4, r2 //KEY_DOUBLE_UPDATE_2(r5) + str.w r4, [r1, #216] + str.w r5, [r1, #220] + and r2, r12, r4, ror #24 + and r4, r11, r4, ror #20 + orr r4, r4, r2 //KEY_TRIPLE_UPDATE_2(r5) + and r2, r11, r5, ror #24 + and r5, r12, r5, ror #16 + orr r5, r5, r2 //KEY_DOUBLE_UPDATE_2(r4) + str.w r5, [r1, #296] + str.w r4, [r1, #300] + // KEY_DOULBE/TRIPLE_UPDATE_3 + // masks + orr r12, r8, r8, lsl #8 //r12<- 0x07070707 + movw r11, #0xc0c0 + movw r10, #0x3030 + and r9, r12, r12, lsr #1 //r9 <- 0x03030303 + lsl r8, r12, #4 + eor r7, r8, r9, lsl #5 + movw r6, #0xf0f0 + ldrd r4, r5, [r1, #24] + and r2, r10, r4, lsr #18 + and r3, r4, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r4, lsr #14 + orr r2, r2, r3 + and r3, r4, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7, lsr #16 + orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r9, r5, lsr #2 + and r3, r9, r5 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5) + str.w r5, [r1, #104] + str.w r4, [r1, #108] + and r2, r10, r5, lsr #18 + and r3, r5, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r5, lsr #14 + orr r2, r2, r3 + and r3, r5, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7, lsr #16 + orr r5, r2, r5, lsl #19 //KEY_TRIPLE_UPDATE_4(r5) + and r2, r9, r4, lsr #2 + and r3, r9, r4 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_DOUBLE_UPDATE_4(r4) + str.w r4, [r1, #184] + str.w r5, [r1, #188] + and r2, r10, r4, lsr #18 + and r3, r4, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r4, lsr #14 + orr r2, r2, r3 + and r3, r4, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7, lsr #16 + orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r9, r5, lsr #2 + and r3, r9, r5 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5) + strd r5, r4, [r1, #264] + ldrd r4, r5, [r1, #64] + and r2, r10, r4, lsr #18 + and r3, r4, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r4, lsr #14 + orr r2, r2, r3 + and r3, r4, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7, lsr #16 + orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r9, r5, lsr #2 + and r3, r9, r5 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5) + str.w r5, [r1, #144] + str.w r4, [r1, #148] + and r2, r10, r5, lsr #18 + and r3, r5, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and 
r3, r11, r5, lsr #14 + orr r2, r2, r3 + and r3, r5, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7, lsr #16 + orr r5, r2, r5, lsl #19 //KEY_TRIPLE_UPDATE_4(r5) + and r2, r9, r4, lsr #2 + and r3, r9, r4 + orr r2, r2, r3, lsl #2 + and r3, r8, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7 + orr r4, r2, r4, lsl #3 //KEY_DOUBLE_UPDATE_4(r4) + str.w r4, [r1, #224] + str.w r5, [r1, #228] + and r2, r10, r4, lsr #18 + and r3, r4, r7, lsr #4 + orr r2, r2, r3, lsl #3 + and r3, r11, r4, lsr #14 + orr r2, r2, r3 + and r3, r4, r12, lsr #11 + orr r2, r2, r3, lsl #15 + and r3, r12, r4, lsr #1 + orr r2, r2, r3 + and r4, r4, r7, lsr #16 + orr r4, r2, r4, lsl #19 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r9, r5, lsr #2 + and r3, r9, r5 + orr r2, r2, r3, lsl #2 + and r3, r8, r5, lsr #1 + orr r2, r2, r3 + and r5, r5, r7 + orr r5, r2, r5, lsl #3 //KEY_DOUBLE_UPDATE_4(r5) + str.w r5, [r1, #304] + str.w r4, [r1, #308] + // KEY_DOULBE/TRIPLE_UPDATE_4 + // masks + movw r12, #0x0fff + lsl r10, r12, #16 + movw r8, #0x00ff + movw r7, #0x03ff + lsl r7, r7, #16 + ldrd r4, r5, [r1, #32] + and r2, r7, r4, lsr #6 + and r3, r4, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r4, lsr #4 + orr r2, r2, r3 + and r4, r4, #0x000f + orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r10, r5, lsr #4 + and r3, r5, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r5, lsr #8 + orr r2, r2, r3 + and r5, r5, r8 + orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5) + str.w r5, [r1, #112] + str.w r4, [r1, #116] + and r2, r7, r5, lsr #6 + and r3, r5, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r5, lsr #4 + orr r2, r2, r3 + and r5, r5, #0x000f + orr r5, r2, r5, lsl #12 //KEY_TRIPLE_UPDATE_4(r5) + and r2, r10, r4, lsr #4 + and r3, r4, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r4, lsr #8 + orr r2, r2, r3 + and r4, r4, r8 + orr r4, r2, r4, lsl #8 //KEY_DOUBLE_UPDATE_4(r4) + str.w r4, [r1, #192] + str.w r5, [r1, #196] + and r2, r7, r4, lsr #6 + and r3, r4, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r4, lsr #4 + orr r2, r2, r3 + and r4, r4, #0x000f + orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r10, r5, lsr #4 + and r3, r5, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r5, lsr #8 + orr r2, r2, r3 + and r5, r5, r8 + orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5) + strd r5, r4, [r1, #272] + ldrd r4, r5, [r1, #72] + and r2, r7, r4, lsr #6 + and r3, r4, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r4, lsr #4 + orr r2, r2, r3 + and r4, r4, #0x000f + orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r10, r5, lsr #4 + and r3, r5, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r5, lsr #8 + orr r2, r2, r3 + and r5, r5, r8 + orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5) + str.w r5, [r1, #152] + str.w r4, [r1, #156] + and r2, r7, r5, lsr #6 + and r3, r5, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r5, lsr #4 + orr r2, r2, r3 + and r5, r5, #0x000f + orr r5, r2, r5, lsl #12 //KEY_TRIPLE_UPDATE_4(r5) + and r2, r10, r4, lsr #4 + and r3, r4, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r4, lsr #8 + orr r2, r2, r3 + and r4, r4, r8 + orr r4, r2, r4, lsl #8 //KEY_DOUBLE_UPDATE_4(r4) + str.w r4, [r1, #232] + str.w r5, [r1, #236] + and r2, r7, r4, lsr #6 + and r3, r4, #0x003f0000 + orr r2, r2, r3, lsl #10 + and r3, r12, r4, lsr #4 + orr r2, r2, r3 + and r4, r4, #0x000f + orr r4, r2, r4, lsl #12 //KEY_TRIPLE_UPDATE_4(r4) + and r2, r10, r5, lsr #4 + and r3, r5, #0x000f0000 + orr r2, r2, r3, lsl #12 + and r3, r8, r5, 
lsr #8 + orr r2, r2, r3 + and r5, r5, r8 + orr r5, r2, r5, lsl #8 //KEY_DOUBLE_UPDATE_4(r5) + str.w r5, [r1, #312] + str.w r4, [r1, #316] + pop {r2-r12,r14} + bx lr + +/***************************************************************************** +* Fully unrolled ARM assembly implementation of the GIFTb-128 block cipher. +* This function simply encrypts a 128-bit block, without any operation mode. +*****************************************************************************/ +@ void giftb128_encrypt_block(u8 *out, const u32* rkey, const u8 *block) +.global giftb128_encrypt_block +.type giftb128_encrypt_block,%function +giftb128_encrypt_block: + push {r2-r12,r14} + // load plaintext blocks + ldm r2, {r9-r12} + // endianness + rev r9, r9 + rev r10, r10 + rev r11, r11 + rev r12, r12 + // masks for HALF/BYTE/NIBBLE rotations + movw r2, #0x1111 + movt r2, #0x1111 //for NIBBLE_ROR + movw r3, #0x000f + movt r3, #0x000f //for HALF_ROR + mvn r4, r2, lsl #3 //0x7777777 for NIBBLE_ROR + // ------------------ 1st QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x0008 + movt r5, 0x1000 //load rconst + ldrd r6, r7, [r1] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + and r8, r4, r12, lsr #1 + and r12, r12, r2 + orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 2nd round + movw r5, 0x8000 + movt r5, 0x8001 //load rconst + ldrd r6, r7, [r1, #8] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r9, lsr #4 + and r9, r9, r3 + orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x5400 //load rconst + ldrd r6, r7, [r1, #16] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r12, r12, lsr #1 + and r8, r8, r14, lsr #16 + eor r12, r12, r8 + eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x55550000, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x00005555, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 4th round + movw r5, 0x0181 + movt r5, 0x0101 //load rconst + ldrd r6, r7, [r1, #24] //load rkey + and r8, r11, r12, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + 
eor r12, r8, r12, ror #16 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r9, lsr #6 + and r9, r14, r9 + orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 5th round + movw r5, 0x001f + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #32] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r9, r9, r5 //add rconst + + // ------------------ 2nd QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x8880 + movt r5, 0x1088 //load rconst + ldrd r6, r7, [r1, #40] //load rkey + and r8, r11, r12, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #24 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + and r8, r4, r9, lsr #1 + and r9, r9, r2 + orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 2nd round + movw r5, 0xe000 + movt r5, 0x6001 //load rconst + ldrd r6, r7, [r1, #48] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r12, lsr #4 + and r12, r12, r3 + orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x5150 //load rconst + ldrd r6, r7, [r1, #56] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r9, r9, lsr #1 + and r8, r8, r14, lsr #16 + eor r9, r9, r8 + eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 4th round + movw r5, 0x0180 + movt r5, 
0x0303 //load rconst + ldrd r6, r7, [r1, #64] //load rkey + and r8, r11, r9, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #16 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r12, lsr #6 + and r12, r14, r12 + orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 5th round + movw r5, 0x002f + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #72] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r12, r12, r5 //add rconst + + // ------------------ 3rd QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x8880 + movt r5, 0x1008 //load rconst + ldrd r6, r7, [r1, #80] //load rkey + and r8, r11, r9, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #24 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + and r8, r4, r12, lsr #1 + and r12, r12, r2 + orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 2nd round + movw r5, 0x6000 + movt r5, 0x6001 //load rconst + ldrd r6, r7, [r1, #88] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r9, lsr #4 + and r9, r9, r3 + orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x4150 //load rconst + ldrd r6, r7, [r1, #96] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r12, r12, lsr #1 + and r8, r8, r14, lsr #16 + eor r12, r12, r8 + eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x00005555, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor 
r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 4th round + movw r5, 0x0080 + movt r5, 0x0303 //load rconst + ldrd r6, r7, [r1, #104] //load rkey + and r8, r11, r12, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #16 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r9, lsr #6 + and r9, r14, r9 + orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 5th round + movw r5, 0x0027 + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #112] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r9, r9, r5 //add rconst + + // ------------------ 4th QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x8880 + movt r5, 0x1000 //load rconst + ldrd r6, r7, [r1, #120] //load rkey + and r8, r11, r12, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #24 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + and r8, r4, r9, lsr #1 + and r9, r9, r2 + orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 2nd round + movw r5, 0xe000 + movt r5, 0x4001 //load rconst + ldrd r6, r7, [r1, #128] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r12, lsr #4 + and r12, r12, r3 + orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x1150 //load rconst + ldrd r6, r7, [r1, #136] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r9, r9, lsr #1 + and r8, r8, r14, lsr #16 + eor r9, r9, r8 + eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1) + 
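// Note on the SWAPMOVE sequences in these rounds: SWAPMOVE(a, b, mask, n)
+ // is the standard bit-permutation idiom; in C it reads (a sketch, matching
+ // the SWAPMOVE macro in the opt32 giftb128.h further down this diff):
+ //   tmp = (b ^ (a >> n)) & mask;  b ^= tmp;  a ^= tmp << n;
+ // i.e. the bits of b selected by mask are exchanged with the bits of a
+ // selected by (mask << n). Each eor/and/eor/eor group here is the direct
+ // register-level translation, with r8 holding tmp and the mask taken from
+ // r14, rebuilt on the fly from the constants kept in r2/r3.
+ 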
eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 4th round + movw r5, 0x0180 + movt r5, 0x0302 //load rconst + ldrd r6, r7, [r1, #144] //load rkey + and r8, r11, r9, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #16 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r12, lsr #6 + and r12, r14, r12 + orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 5th round + movw r5, 0x002b + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #152] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r12, r12, r5 //add rconst + + // ------------------ 5th QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x0880 + movt r5, 0x1008 //load rconst + ldrd r6, r7, [r1, #160] //load rkey + and r8, r11, r9, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #24 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + and r8, r4, r12, lsr #1 + and r12, r12, r2 + orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 2nd round + movw r5, 0x4000 + movt r5, 0x6001 //load rconst + ldrd r6, r7, [r1, #168] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r9, lsr #4 + and r9, r9, r3 + orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x0140 //load rconst + ldrd r6, r7, [r1, #176] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, 
r10, 0x55555555, 1) + eor r8, r12, r12, lsr #1 + and r8, r8, r14, lsr #16 + eor r12, r12, r8 + eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x00005555, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 4th round + movw r5, 0x0080 + movt r5, 0x0202 //load rconst + ldrd r6, r7, [r1, #184] //load rkey + and r8, r11, r12, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #16 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r9, lsr #6 + and r9, r14, r9 + orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 5th round + movw r5, 0x0021 + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #192] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r9, r9, r5 //add rconst + + // ------------------ 6th QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x0080 + movt r5, 0x1000 //load rconst + ldrd r6, r7, [r1, #200] //load rkey + and r8, r11, r12, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #24 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + and r8, r4, r9, lsr #1 + and r9, r9, r2 + orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 2nd round + movw r5, 0xc000 + movt r5, 0x0001 //load rconst + ldrd r6, r7, [r1, #208] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r12, lsr #4 + and r12, r12, r3 + orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x5100 //load rconst + ldrd r6, r7, [r1, #216] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + orr r14, 
r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r9, r9, lsr #1 + and r8, r8, r14, lsr #16 + eor r9, r9, r8 + eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 4th round + movw r5, 0x0180 + movt r5, 0x0301 //load rconst + ldrd r6, r7, [r1, #224] //load rkey + and r8, r11, r9, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #16 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r12, lsr #6 + and r12, r14, r12 + orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 5th round + movw r5, 0x002e + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #232] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r12, r12, r5 //add rconst + + + // ------------------ 7th QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x8800 + movt r5, 0x1008 //load rconst + ldrd r6, r7, [r1, #240] //load rkey + and r8, r11, r9, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #24 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + and r8, r4, r12, lsr #1 + and r12, r12, r2 + orr r12, r8, r12, lsl #3 //NIBBLE_ROR(r12, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 2nd round + movw r5, 0x2000 + movt r5, 0x6001 //load rconst + ldrd r6, r7, [r1, #248] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r9, lsr #4 + and r9, r9, r3 + orr r9, r8, r9, lsl #12 //HALF_ROR(r9, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 3rd round + movw r5, 0x0002 + movt r5, 0x4050 //load rconst + ldrd r6, r7, [r1, #256] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor 
r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r12, r12, lsr #1 + and r8, r8, r14, lsr #16 + eor r12, r12, r8 + eor r12, r12, r8, lsl #1 //SWAPMOVE(r12, r12, 0x00005555, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 4th round + movw r5, 0x0080 + movt r5, 0x0103 //load rconst + ldrd r6, r7, [r1, #264] //load rkey + and r8, r11, r12, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #16 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r9, lsr #6 + and r9, r14, r9 + orr r9, r8, r9, lsl #2 //BYTE_ROR(r9, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 5th round + movw r5, 0x0006 + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #272] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r9, r9, r5 //add rconst + + // ------------------ 8th QUINTUPLE ROUND ------------------ + // 1st round + movw r5, 0x8808 + movt r5, 0x1000 //load rconst + ldrd r6, r7, [r1, #280] //load rkey + and r8, r11, r12, ror #24 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r8, r12, ror #24 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + and r8, r4, r9, lsr #1 + and r9, r9, r2 + orr r9, r8, r9, lsl #3 //NIBBLE_ROR(r9, 1) + and r8, r4, r11 + and r11, r2, r11, lsr #3 + orr r11, r11, r8, lsl #1 //NIBBLE_ROR(r11, 3) + orr r14, r2, r2, lsl #1 //0x33333333 for NIBBLE_ROR + and r8, r14, r10, lsr #2 + and r10, r10, r14 + orr r10, r8, r10, lsl #2 //NIBBLE_ROR(r10, 2) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 2nd round + movw r5, 0xa000 + movt r5, 0xc001 //load rconst + ldrd r6, r7, [r1, #288] //load rkey + and r8, r9, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r9, r8 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + mvn r14, r3, lsl #12 //0x0fff0fff for HALF_ROR + and r8, r14, r12, lsr #4 + and r12, r12, r3 + orr r12, r8, r12, lsl #12 //HALF_ROR(r12, 4) + and r8, r3, r11, lsr #12 + and r11, r11, r14 + orr r11, r8, r11, lsl #4 //HALF_ROR(r11, 12) + rev16 r10, r10 //HALF_ROR(r10, 8) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 3rd round + movw r5, 
0x0002 + movt r5, 0x1450 //load rconst + ldrd r6, r7, [r1, #296] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9 + orr r14, r2, r2, lsl #2 //0x55555555 for SWAPMOVE + eor r8, r10, r10, lsr #1 + and r8, r8, r14 + eor r10, r10, r8 + eor r10, r10, r8, lsl #1 //SWAPMOVE(r10, r10, 0x55555555, 1) + eor r8, r9, r9, lsr #1 + and r8, r8, r14, lsr #16 + eor r9, r9, r8 + eor r9, r9, r8, lsl #1 //SWAPMOVE(r9, r9, 0x00005555, 1) + eor r8, r11, r11, lsr #1 + and r8, r8, r14, lsl #16 + eor r11, r11, r8 + eor r11, r11, r8, lsl #1 //SWAPMOVE(r11, r11, 0x55550000, 1) + eor r10, r10, r6 //add 1st keyword + eor r11, r7, r11, ror #16 //add 2nd keyword + eor r12, r12, r5 //add rconst + // 4th round + movw r5, 0x0181 + movt r5, 0x0102 //load rconst + ldrd r6, r7, [r1, #304] //load rkey + and r8, r11, r9, ror #16 //sbox layer + eor r10, r10, r8 + and r8, r10, r12 + eor r9, r8, r9, ror #16 + orr r8, r9, r10 + eor r11, r11, r8 + eor r12, r12, r11 + eor r10, r10, r12 + and r8, r9, r10 + eor r11, r11, r8 + mvn r12, r12 + eor r14, r3, r3, lsl #8 //0x0f0f0f0f for BYTE_ROR + and r8, r14, r10, lsr #4 + and r10, r10, r14 + orr r10, r8, r10, lsl #4 //BYTE_ROR(r10, 4) + orr r14, r14, r14, lsl #2 //0x3f3f3f3f for BYTE_ROR + mvn r8, r14 + and r8, r8, r11, lsl #6 + and r11, r14, r11, lsr #2 + orr r11, r11, r8 //BYTE_ROR(r11, 2) + mvn r8, r14, lsr #6 + and r8, r8, r12, lsr #6 + and r12, r14, r12 + orr r12, r8, r12, lsl #2 //BYTE_ROR(r12, 6) + eor r10, r10, r6 //add 1st keyword + eor r11, r11, r7 //add 2nd keyword + eor r9, r9, r5 //add rconst + // 5th round + movw r5, 0x001a + movt r5, 0x8000 //load rconst + ldrd r6, r7, [r1, #312] //load rkey + and r8, r12, r11 //sbox layer + eor r10, r10, r8 + and r8, r10, r9 + eor r12, r12, r8 + orr r8, r12, r10 + eor r11, r11, r8 + eor r9, r9, r11 + eor r10, r10, r9 + and r8, r12, r10 + eor r11, r11, r8 + mvn r9, r9, ror #24 + eor r10, r6, r10, ror #16 //add 1st keyword + eor r11, r7, r11, ror #8 //add 2nd keyword + eor r12, r12, r5 //add rconst + // endianness + rev r9, r9 + rev r10, r10 + rev r11, r11 + rev r12, r12 + stm r0, {r9-r12} + pop {r2-r12,r14} + bx lr diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/api.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/api.h new file mode 100644 index 0000000..fb1d58b --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/api.h @@ -0,0 +1,5 @@ +#define CRYPTO_KEYBYTES 16 +#define CRYPTO_NSECBYTES 0 +#define CRYPTO_NPUBBYTES 16 +#define CRYPTO_ABYTES 16 +#define CRYPTO_NOOVERLAP 1 diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/cofb.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/cofb.h new file mode 100644 index 0000000..6a60e36 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/cofb.h @@ -0,0 +1,62 @@ +#ifndef COFB_H_ +#define COFB_H_ + +#define DOUBLE_HALF_BLOCK(x) ({ \ + tmp0 = (x)[0]; \ + (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \ + (x)[0] |= ((x)[1] & 0x80808080) << 17; \ + (x)[1] = (((x)[1] & 0x7f7f7f7f) << 1) | (((x)[1] & 0x80808080) >> 15); \ + (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \ +}) + +#define TRIPLE_HALF_BLOCK(x) ({ \ + tmp0 = (x)[0]; \ + tmp1 = (x)[1]; \ + (x)[0] = (((x)[0] & 0x7f7f7f7f) << 1) | (((x)[0] & 0x80808080) >> 15); \ + (x)[0] |= ((x)[1] & 0x80808080) << 17; \ + (x)[1] = (((x)[1] & 0x7f7f7f7f) 
<< 1) | (((x)[1] & 0x80808080) >> 15); \
+ (x)[1] ^= (((tmp0 >> 7) & 1) * 27) << 24; \
+ (x)[0] ^= tmp0; \
+ (x)[1] ^= tmp1; \
+})
+
+#define G(x) ({ \
+ tmp0 = (x)[0]; \
+ tmp1 = (x)[1]; \
+ (x)[0] = (x)[2]; \
+ (x)[1] = (x)[3]; \
+ (x)[2] = ((tmp0 & 0x7f7f7f7f) << 1) | ((tmp0 & 0x80808080) >> 15); \
+ (x)[2] |= ((tmp1 & 0x80808080) << 17); \
+ (x)[3] = ((tmp1 & 0x7f7f7f7f) << 1) | ((tmp1 & 0x80808080) >> 15); \
+ (x)[3] |= ((tmp0 & 0x80808080) << 17); \
+})
+
+#define XOR_BLOCK(x, y, z) ({ \
+ (x)[0] = (y)[0] ^ (z)[0]; \
+ (x)[1] = (y)[1] ^ (z)[1]; \
+ (x)[2] = (y)[2] ^ (z)[2]; \
+ (x)[3] = (y)[3] ^ (z)[3]; \
+})
+
+#define XOR_TOP_BAR_BLOCK(x, y) ({ \
+ (x)[0] ^= (y)[0]; \
+ (x)[1] ^= (y)[1]; \
+})
+
+#define RHO1(d, y, m, n) ({ \
+ G(y); \
+ padding(d,m,n); \
+ XOR_BLOCK(d, d, y); \
+})
+
+#define RHO(y, m, x, c, n) ({ \
+ XOR_BLOCK(c, y, m); \
+ RHO1(x, y, m, n); \
+})
+
+#define RHO_PRIME(y, c, x, m, n) ({ \
+ XOR_BLOCK(m, y, c); \
+ RHO1(x, y, m, n); \
+})
+
+#endif // COFB_H_
\ No newline at end of file
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/encrypt.c b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/encrypt.c
new file mode 100644
index 0000000..5f12a47
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/encrypt.c
@@ -0,0 +1,163 @@
+/*******************************************************************************
+* Constant-time 32-bit implementation of the GIFT-COFB authenticated cipher.
+*
+* @author Alexandre Adomnicai, Nanyang Technological University,
+* alexandre.adomnicai@ntu.edu.sg
+* @date January 2020
+*******************************************************************************/
+#include <string.h> //for memcpy
+#include "api.h"
+#include "cofb.h"
+#include "giftb128.h"
+
+#define TAGBYTES CRYPTO_ABYTES
+#define BLOCKBYTES CRYPTO_ABYTES
+#define COFB_ENCRYPT 1
+#define COFB_DECRYPT 0
+
+/****************************************************************************
+* 32-bit padding implementation.
+****************************************************************************/
+static inline void padding(u32* d, const u32* s, const u32 no_of_bytes){
+ u32 i;
+ if (no_of_bytes == 0) {
+ d[0] = 0x00000080; // little-endian
+ d[1] = 0x00000000;
+ d[2] = 0x00000000;
+ d[3] = 0x00000000;
+ }
+ else if (no_of_bytes < BLOCKBYTES) {
+ for (i = 0; i < no_of_bytes/4+1; i++)
+ d[i] = s[i];
+ d[i-1] &= ~(0xffffffffL << (no_of_bytes % 4)*8);
+ d[i-1] |= 0x00000080L << (no_of_bytes % 4)*8;
+ for (; i < 4; i++)
+ d[i] = 0x00000000;
+ }
+ else {
+ d[0] = s[0];
+ d[1] = s[1];
+ d[2] = s[2];
+ d[3] = s[3];
+ }
+}
+
+/****************************************************************************
+* Constant-time implementation of the GIFT-COFB authenticated cipher based on
+* fixsliced GIFTb-128. Encryption/decryption is handled by the same function,
+* depending on the 'encrypting' parameter (1/0).
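+*
+* A minimal caller-side sketch (illustrative only; the VLA buffer and the
+* variable names are hypothetical, not part of this implementation):
+*   u8 ct[mlen + TAGBYTES];   // ciphertext followed by the 16-byte tag
+*   giftcofb_crypt(ct, key, nonce, ad, adlen, msg, mlen, COFB_ENCRYPT);
+*   int fail = giftcofb_crypt(msg, key, nonce, ad, adlen,
+*                             ct, mlen + TAGBYTES, COFB_DECRYPT);
+*   // fail is 0 on success and nonzero if tag verification failed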
+****************************************************************************/ +int giftcofb_crypt(u8* out, const u8* key, const u8* nonce, const u8* ad, + u32 ad_len, const u8* in, u32 in_len, const int encrypting) { + + u32 tmp0, tmp1, emptyA, emptyM; + u32 offset[2], input[4], rkey[80]; + u8 Y[16]; + + if (!encrypting) { + if (in_len < TAGBYTES) + return -1; + in_len -= TAGBYTES; + } + + if (ad_len == 0) + emptyA = 1; + else + emptyA = 0; + + if (in_len == 0) + emptyM =1; + else + emptyM = 0; + + precompute_rkeys(rkey, key); + giftb128(Y, nonce, rkey); + offset[0] = ((u32*)Y)[0]; + offset[1] = ((u32*)Y)[1]; + + while (ad_len > BLOCKBYTES) { + RHO1(input, (u32*)Y, (u32*)ad, BLOCKBYTES); + DOUBLE_HALF_BLOCK(offset); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128(Y, (u8*)input, rkey); + ad += BLOCKBYTES; + ad_len -= BLOCKBYTES; + } + + TRIPLE_HALF_BLOCK(offset); + if ((ad_len % BLOCKBYTES != 0) || (emptyA)) + TRIPLE_HALF_BLOCK(offset); + if (emptyM) { + TRIPLE_HALF_BLOCK(offset); + TRIPLE_HALF_BLOCK(offset); + } + + RHO1(input, (u32*)Y, (u32*)ad, ad_len); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128(Y, (u8*)input, rkey); + + while (in_len > BLOCKBYTES) { + DOUBLE_HALF_BLOCK(offset); + if (encrypting) + RHO((u32*)Y, (u32*)in, input, (u32*)out, BLOCKBYTES); + else + RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, BLOCKBYTES); + XOR_TOP_BAR_BLOCK(input, offset); + giftb128(Y, (u8*)input, rkey); + in += BLOCKBYTES; + out += BLOCKBYTES; + in_len -= BLOCKBYTES; + } + + if (!emptyM) { + TRIPLE_HALF_BLOCK(offset); + if(in_len % BLOCKBYTES != 0) + TRIPLE_HALF_BLOCK(offset); + if (encrypting) { + RHO((u32*)Y, (u32*)in, input, (u32*)out, in_len); + out += in_len; + } + else { + RHO_PRIME((u32*)Y, (u32*)in, input, (u32*)out, in_len); + in += in_len; + } + XOR_TOP_BAR_BLOCK(input, offset); + giftb128(Y, (u8*)input, rkey); + } + + if (encrypting) { + memcpy(out, Y, TAGBYTES); + return 0; + } + // decrypting + tmp0 = 0; + for(tmp1 = 0; tmp1 < TAGBYTES; tmp1++) + tmp0 |= in[tmp1] ^ Y[tmp1]; + return tmp0; +} + +/**************************************************************************** +* API required by the NIST for the LWC competition. +****************************************************************************/ +int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen, + const unsigned char* m, unsigned long long mlen, + const unsigned char* ad, unsigned long long adlen, + const unsigned char* nsec, const unsigned char* npub, + const unsigned char* k) { + (void)nsec; + *clen = mlen + TAGBYTES; + return giftcofb_crypt(c, k, npub, ad, adlen, m, mlen, COFB_ENCRYPT); +} + +/**************************************************************************** +* API required by the NIST for the LWC competition. 
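+* The decryption counterpart returns 0 on success and a nonzero value when
+* the tag does not verify (the tag comparison in giftcofb_crypt is a
+* constant-time byte-difference OR), in which case the output must be
+* discarded. A hedged caller sketch, with hypothetical buffer names:
+*   unsigned long long mlen;
+*   if (crypto_aead_decrypt(m, &mlen, NULL, c, clen, ad, adlen, npub, k) != 0)
+*       return -1; // forgery: do not use the contents of m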
+****************************************************************************/
+int crypto_aead_decrypt(unsigned char* m, unsigned long long *mlen,
+ unsigned char* nsec, const unsigned char* c,
+ unsigned long long clen, const unsigned char* ad,
+ unsigned long long adlen, const unsigned char* npub,
+ const unsigned char *k) {
+ (void)nsec;
+ if (clen < TAGBYTES) // reject inputs too short to carry the tag
+ return -1;
+ *mlen = clen - TAGBYTES;
+ return giftcofb_crypt(m, k, npub, ad, adlen, c, clen, COFB_DECRYPT);
+}
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/endian.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/endian.h
new file mode 100644
index 0000000..4b3879a
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/endian.h
@@ -0,0 +1,14 @@
+#ifndef ENDIAN_H_
+#define ENDIAN_H_
+
+#define U32BIG(x) \
+ ((((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | \
+ (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24))
+
+#define U8BIG(x, y) \
+ (x)[0] = (y) >> 24; \
+ (x)[1] = ((y) >> 16) & 0xff; \
+ (x)[2] = ((y) >> 8) & 0xff; \
+ (x)[3] = (y) & 0xff;
+
+#endif // ENDIAN_H_
\ No newline at end of file
diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/giftb128.c b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/giftb128.c
new file mode 100644
index 0000000..a1e31ed
--- /dev/null
+++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/giftb128.c
@@ -0,0 +1,96 @@
+/*******************************************************************************
+* Optimized constant-time implementation of the GIFTb-128 block cipher.
+*
+* @author Alexandre Adomnicai, Nanyang Technological University,
+* alexandre.adomnicai@ntu.edu.sg
+*
+* @date January 2020
+*******************************************************************************/
+#include "endian.h"
+#include "giftb128.h"
+#include "key_schedule.h"
+
+/*****************************************************************************
+* The round constants according to the fixsliced representation.
+*****************************************************************************/
+const u32 rconst[40] = {
+ 0x10000008, 0x80018000, 0x54000002, 0x01010181,
+ 0x8000001f, 0x10888880, 0x6001e000, 0x51500002,
+ 0x03030180, 0x8000002f, 0x10088880, 0x60016000,
+ 0x41500002, 0x03030080, 0x80000027, 0x10008880,
+ 0x4001e000, 0x11500002, 0x03020180, 0x8000002b,
+ 0x10080880, 0x60014000, 0x01400002, 0x02020080,
+ 0x80000021, 0x10000080, 0x0001c000, 0x51000002,
+ 0x03010180, 0x8000002e, 0x10088800, 0x60012000,
+ 0x40500002, 0x01030080, 0x80000006, 0x10008808,
+ 0xc001a000, 0x14500002, 0x01020181, 0x8000001a
+};
+
+/*****************************************************************************
+* The first 20 rkeys are computed using the classical representation before
+* being rearranged into fixsliced representations depending on round numbers.
+* The 60 remaining rkeys are directly computed in fixsliced representations.
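+*
+* (For reference, one step of the classical GIFT-128 key schedule rotates
+* the two 16-bit halves of a key word, the low half right by 12 and the
+* high half right by 2. A sketch in C of what the KEY_UPDATE macro from
+* key_schedule.h computes:
+*   u32 key_update(u32 x) {
+*       return ((x >> 12) & 0x0000000f) | ((x & 0x00000fff) << 4)
+*            | ((x >> 2) & 0x3fff0000) | ((x & 0x00030000) << 14);
+*   }
+* Roughly, the KEY_DOUBLE_UPDATE_i / KEY_TRIPLE_UPDATE_i macros apply two,
+* respectively three, such updates at once, expressed directly in the
+* fixsliced bit orderings.)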
+*****************************************************************************/ +void precompute_rkeys(u32* rkey, const u8* key) { + u32 tmp; + //classical initialization + rkey[0] = U32BIG(((u32*)key)[3]); + rkey[1] = U32BIG(((u32*)key)[1]); + rkey[2] = U32BIG(((u32*)key)[2]); + rkey[3] = U32BIG(((u32*)key)[0]); + // classical keyschedule + for(int i = 0; i < 16; i+=2) { + rkey[i+4] = rkey[i+1]; + rkey[i+5] = KEY_UPDATE(rkey[i]); + } + // transposition to fixsliced representations + for(int i = 0; i < 20; i+=10) { + rkey[i] = REARRANGE_RKEY_0(rkey[i]); + rkey[i + 1] = REARRANGE_RKEY_0(rkey[i + 1]); + rkey[i + 2] = REARRANGE_RKEY_1(rkey[i + 2]); + rkey[i + 3] = REARRANGE_RKEY_1(rkey[i + 3]); + rkey[i + 4] = REARRANGE_RKEY_2(rkey[i + 4]); + rkey[i + 5] = REARRANGE_RKEY_2(rkey[i + 5]); + rkey[i + 6] = REARRANGE_RKEY_3(rkey[i + 6]); + rkey[i + 7] = REARRANGE_RKEY_3(rkey[i + 7]); + } + // keyschedule according to fixsliced representations + for(int i = 20; i < 80; i+=10) { + rkey[i] = rkey[i-19]; + rkey[i+1] = KEY_TRIPLE_UPDATE_0(rkey[i-20]); + rkey[i+2] = KEY_DOUBLE_UPDATE_1(rkey[i-17]); + rkey[i+3] = KEY_TRIPLE_UPDATE_1(rkey[i-18]); + rkey[i+4] = KEY_DOUBLE_UPDATE_2(rkey[i-15]); + rkey[i+5] = KEY_TRIPLE_UPDATE_2(rkey[i-16]); + rkey[i+6] = KEY_DOUBLE_UPDATE_3(rkey[i-13]); + rkey[i+7] = KEY_TRIPLE_UPDATE_3(rkey[i-14]); + rkey[i+8] = KEY_DOUBLE_UPDATE_4(rkey[i-11]); + rkey[i+9] = KEY_TRIPLE_UPDATE_4(rkey[i-12]); + SWAPMOVE(rkey[i], rkey[i], 0x00003333, 16); + SWAPMOVE(rkey[i], rkey[i], 0x55554444, 1); + SWAPMOVE(rkey[i+1], rkey[i+1], 0x55551100, 1); + } +} + +/***************************************************************************** +* Encryption of a single 128-bit block with GIFTb-128 (used in GIFT-COFB). +*****************************************************************************/ +void giftb128(u8* ctext, const u8* ptext, const u32* rkey) { + u32 tmp, state[4]; + state[0] = U32BIG(((u32*)ptext)[0]); + state[1] = U32BIG(((u32*)ptext)[1]); + state[2] = U32BIG(((u32*)ptext)[2]); + state[3] = U32BIG(((u32*)ptext)[3]); + QUINTUPLE_ROUND(state, rkey, rconst); + QUINTUPLE_ROUND(state, rkey + 10, rconst + 5); + QUINTUPLE_ROUND(state, rkey + 20, rconst + 10); + QUINTUPLE_ROUND(state, rkey + 30, rconst + 15); + QUINTUPLE_ROUND(state, rkey + 40, rconst + 20); + QUINTUPLE_ROUND(state, rkey + 50, rconst + 25); + QUINTUPLE_ROUND(state, rkey + 60, rconst + 30); + QUINTUPLE_ROUND(state, rkey + 70, rconst + 35); + U8BIG(ctext, state[0]); + U8BIG(ctext + 4, state[1]); + U8BIG(ctext + 8, state[2]); + U8BIG(ctext + 12, state[3]); +} diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/giftb128.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/giftb128.h new file mode 100644 index 0000000..e1b8d10 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/giftb128.h @@ -0,0 +1,88 @@ +#ifndef GIFT128_H_ +#define GIFT128_H_ + +typedef unsigned char u8; +typedef unsigned int u32; + +extern void precompute_rkeys(u32* rkeys, const u8* key); +extern void giftb128(u8* out, const u8* in, const u32* rkeys); + +#define ROR(x,y) \ + (((x) >> (y)) | ((x) << (32 - (y)))) +#define BYTE_ROR_2(x) \ + ((((x) >> 2) & 0x3f3f3f3f) | (((x) & 0x03030303) << 6)) +#define BYTE_ROR_4(x) \ + ((((x) >> 4) & 0x0f0f0f0f) | (((x) & 0x0f0f0f0f) << 4)) +#define BYTE_ROR_6(x) \ + ((((x) >> 6) & 0x03030303) | (((x) & 0x3f3f3f3f) << 2)) +#define HALF_ROR_4(x) \ + ((((x) >> 4) & 0x0fff0fff) | (((x) & 0x000f000f) << 12)) +#define HALF_ROR_8(x) \ + ((((x) >> 8) & 0x00ff00ff) | (((x) & 
0x00ff00ff) << 8)) +#define HALF_ROR_12(x) \ + ((((x) >> 12)& 0x000f000f) | (((x) & 0x0fff0fff) << 4)) +#define NIBBLE_ROR_1(x) \ + ((((x) >> 1) & 0x77777777) | (((x) & 0x11111111) << 3)) +#define NIBBLE_ROR_2(x) \ + ((((x) >> 2) & 0x33333333) | (((x) & 0x33333333) << 2)) +#define NIBBLE_ROR_3(x) \ + ((((x) >> 3) & 0x11111111) | (((x) & 0x77777777) << 1)) + +#define SWAPMOVE(a, b, mask, n) \ + tmp = (b ^ (a >> n)) & mask; \ + b ^= tmp; \ + a ^= (tmp << n); + +#define SBOX(s0, s1, s2, s3) \ + s1 ^= s0 & s2; \ + s0 ^= s1 & s3; \ + s2 ^= s0 | s1; \ + s3 ^= s2; \ + s1 ^= s3; \ + s3 ^= 0xffffffff; \ + s2 ^= s0 & s1; + +#define QUINTUPLE_ROUND(state, rkey, rconst) ({ \ + SBOX(state[0], state[1], state[2], state[3]); \ + state[3] = NIBBLE_ROR_1(state[3]); \ + state[1] = NIBBLE_ROR_2(state[1]); \ + state[2] = NIBBLE_ROR_3(state[2]); \ + state[1] ^= (rkey)[0]; \ + state[2] ^= (rkey)[1]; \ + state[0] ^= (rconst)[0]; \ + SBOX(state[3], state[1], state[2], state[0]); \ + state[0] = HALF_ROR_4(state[0]); \ + state[1] = HALF_ROR_8(state[1]); \ + state[2] = HALF_ROR_12(state[2]); \ + state[1] ^= (rkey)[2]; \ + state[2] ^= (rkey)[3]; \ + state[3] ^= (rconst)[1]; \ + SBOX(state[0], state[1], state[2], state[3]); \ + state[3] = ROR(state[3], 16); \ + state[2] = ROR(state[2], 16); \ + SWAPMOVE(state[1], state[1], 0x55555555, 1); \ + SWAPMOVE(state[2], state[2], 0x00005555, 1); \ + SWAPMOVE(state[3], state[3], 0x55550000, 1); \ + state[1] ^= (rkey)[4]; \ + state[2] ^= (rkey)[5]; \ + state[0] ^= (rconst)[2]; \ + SBOX(state[3], state[1], state[2], state[0]); \ + state[0] = BYTE_ROR_6(state[0]); \ + state[1] = BYTE_ROR_4(state[1]); \ + state[2] = BYTE_ROR_2(state[2]); \ + state[1] ^= (rkey)[6]; \ + state[2] ^= (rkey)[7]; \ + state[3] ^= (rconst)[3]; \ + SBOX(state[0], state[1], state[2], state[3]); \ + state[3] = ROR(state[3], 24); \ + state[1] = ROR(state[1], 16); \ + state[2] = ROR(state[2], 8); \ + state[1] ^= (rkey)[8]; \ + state[2] ^= (rkey)[9]; \ + state[0] ^= (rconst)[4]; \ + state[0] ^= state[3]; \ + state[3] ^= state[0]; \ + state[0] ^= state[3]; \ +}) + +#endif // GIFT128_H_ \ No newline at end of file diff --git a/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/key_schedule.h b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/key_schedule.h new file mode 100644 index 0000000..3965a44 --- /dev/null +++ b/gift-cofb/Implementations/crypto_aead/giftcofb128v1/opt32/key_schedule.h @@ -0,0 +1,70 @@ +#ifndef KEYSCHEDULE_H_ +#define KEYSCHEDULE_H_ + +#define REARRANGE_RKEY_0(x) ({ \ + SWAPMOVE(x, x, 0x00550055, 9); \ + SWAPMOVE(x, x, 0x000f000f, 12); \ + SWAPMOVE(x, x, 0x00003333, 18); \ + SWAPMOVE(x, x, 0x000000ff, 24); \ +}) + +#define REARRANGE_RKEY_1(x) ({ \ + SWAPMOVE(x, x, 0x11111111, 3); \ + SWAPMOVE(x, x, 0x03030303, 6); \ + SWAPMOVE(x, x, 0x000f000f, 12); \ + SWAPMOVE(x, x, 0x000000ff, 24); \ +}) + +#define REARRANGE_RKEY_2(x) ({ \ + SWAPMOVE(x, x, 0x0000aaaa, 15); \ + SWAPMOVE(x, x, 0x00003333, 18); \ + SWAPMOVE(x, x, 0x0000f0f0, 12); \ + SWAPMOVE(x, x, 0x000000ff, 24); \ +}) + +#define REARRANGE_RKEY_3(x) ({ \ + SWAPMOVE(x, x, 0x0a0a0a0a, 3); \ + SWAPMOVE(x, x, 0x00cc00cc, 6); \ + SWAPMOVE(x, x, 0x0000f0f0, 12); \ + SWAPMOVE(x, x, 0x000000ff, 24); \ +}) + +#define KEY_UPDATE(x) \ + (((x) >> 12) & 0x0000000f) | (((x) & 0x00000fff) << 4) | \ + (((x) >> 2) & 0x3fff0000) | (((x) & 0x00030000) << 14) + +#define KEY_TRIPLE_UPDATE_0(x) \ + (ROR((x) & 0x33333333, 24) | ROR((x) & 0xcccccccc, 16)) + +#define KEY_DOUBLE_UPDATE_1(x) \ + ((((x) >> 4) & 0x0f000f00) | (((x) & 0x0f000f00) << 4) | 
\ + (((x) >> 6) & 0x00030003) | (((x) & 0x003f003f) << 2)) + +#define KEY_TRIPLE_UPDATE_1(x) \ + ((((x) >> 6) & 0x03000300) | (((x) & 0x3f003f00) << 2) | \ + (((x) >> 5) & 0x00070007) | (((x) & 0x001f001f) << 3)) + +#define KEY_DOUBLE_UPDATE_2(x) \ + (ROR((x) & 0xaaaaaaaa, 24) | ROR((x) & 0x55555555, 16)) + +#define KEY_TRIPLE_UPDATE_2(x) \ + (ROR((x) & 0x55555555, 24) | ROR((x) & 0xaaaaaaaa, 20)) + +#define KEY_DOUBLE_UPDATE_3(x) \ + ((((x) >> 2) & 0x03030303) | (((x) & 0x03030303) << 2) | \ + (((x) >> 1) & 0x70707070) | (((x) & 0x10101010) << 3)) + +#define KEY_TRIPLE_UPDATE_3(x) \ + ((((x) >> 18) & 0x00003030) | (((x) & 0x01010101) << 3) | \ + (((x) >> 14) & 0x0000c0c0) | (((x) & 0x0000e0e0) << 15)| \ + (((x) >> 1) & 0x07070707) | (((x) & 0x00001010) << 19)) + +#define KEY_DOUBLE_UPDATE_4(x) \ + ((((x) >> 4) & 0x0fff0000) | (((x) & 0x000f0000) << 12) | \ + (((x) >> 8) & 0x000000ff) | (((x) & 0x000000ff) << 8)) + +#define KEY_TRIPLE_UPDATE_4(x) \ + ((((x) >> 6) & 0x03ff0000) | (((x) & 0x003f0000) << 10) | \ + (((x) >> 4) & 0x00000fff) | (((x) & 0x0000000f) << 12)) + +#endif // KEYSCHEDULE_H_ \ No newline at end of file diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/api.h b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/api.h new file mode 100644 index 0000000..a4aa567 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/api.h @@ -0,0 +1,5 @@ +#define CRYPTO_KEYBYTES 16 +#define CRYPTO_NSECBYTES 0 +#define CRYPTO_NPUBBYTES 16 +#define CRYPTO_ABYTES 16 +#define CRYPTO_NOOVERLAP 1 diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/crypto_aead.h b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/crypto_aead.h new file mode 100644 index 0000000..cfc09d6 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/crypto_aead.h @@ -0,0 +1,11 @@ +int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen, + const unsigned char *m, unsigned long long mlen, + const unsigned char *ad, unsigned long long adlen, + const unsigned char *nsec, const unsigned char *npub, + const unsigned char *k); + +int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen, + unsigned char *nsec, + const unsigned char *c, unsigned long long clen, + const unsigned char *ad, unsigned long long adlen, + const unsigned char *npub, const unsigned char *k); diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/encrypt.c b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/encrypt.c new file mode 100644 index 0000000..91d0e6e --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/encrypt.c @@ -0,0 +1,738 @@ +/* + * Date: 29 November 2018 + * Contact: Thomas Peyrin - thomas.peyrin@gmail.com + * Mustafa Khairallah - mustafam001@e.ntu.edu.sg + */ + +#include "crypto_aead.h" +#include "api.h" +#include "skinny.h" +#include +#include + +void pad (const unsigned char* m, unsigned char* mp, int l, int len8) { + int i; + + for (i = 0; i < l; i++) { + if (i < len8) { + mp[i] = m[i]; + } + else if (i == l - 1) { + mp[i] = (len8 & 0x0f); + } + else { + mp[i] = 0x00; + } + } + +} + +void g8A (unsigned char* s, unsigned char* c) { + unsigned int tmps[4]; + unsigned int tmpc[4]; + + tmps[0] = *((unsigned int *)&s[0]); + tmps[1] = *((unsigned int *)&s[4]); + tmps[2] = *((unsigned int *)&s[8]); + tmps[3] = *((unsigned int *)&s[12]); + + // c[i] = (s[i] >> 1) ^ (s[i] & 0x80) ^ ((s[i] & 0x01) << 7); + // + // (s[i] >> 1) -> ((s[i]>>1)&0x7f) + // (s[i] & 0x80) -> (s[i])&0x80) not changed + 
// ((s[i] & 0x01) << 7) -> ((s[i]<<7)&0x80) + + // use word access because of speeding up + tmpc[0] = ((tmps[0]>>1) & 0x7f7f7f7f) ^ (tmps[0] & 0x80808080) ^ ((tmps[0]<<7) & 0x80808080); + tmpc[1] = ((tmps[1]>>1) & 0x7f7f7f7f) ^ (tmps[1] & 0x80808080) ^ ((tmps[1]<<7) & 0x80808080); + tmpc[2] = ((tmps[2]>>1) & 0x7f7f7f7f) ^ (tmps[2] & 0x80808080) ^ ((tmps[2]<<7) & 0x80808080); + tmpc[3] = ((tmps[3]>>1) & 0x7f7f7f7f) ^ (tmps[3] & 0x80808080) ^ ((tmps[3]<<7) & 0x80808080); + + *((unsigned int *)&c[0]) = tmpc[0]; + *((unsigned int *)&c[4]) = tmpc[1]; + *((unsigned int *)&c[8]) = tmpc[2]; + *((unsigned int *)&c[12]) = tmpc[3]; +} + +void g8A_for_Tag_Generation (unsigned char* s, unsigned char* c) { + unsigned int tmps[4]; + unsigned int tmpc[4]; + + tmps[0] = *((unsigned int *)&s[0]); + tmps[1] = *((unsigned int *)&s[4]); + tmps[2] = *((unsigned int *)&s[8]); + tmps[3] = *((unsigned int *)&s[12]); + + // c[i] = (s[i] >> 1) ^ (s[i] & 0x80) ^ ((s[i] & 0x01) << 7); + // + // (s[i] >> 1) -> ((s[i]>>1)&0x7f) + // (s[i] & 0x80) -> (s[i])&0x80) not changed + // ((s[i] & 0x01) << 7) -> ((s[i]<<7)&0x80) + + // use word access because of speeding up + tmpc[0] = ((tmps[0]>>1) & 0x7f7f7f7f) ^ (tmps[0] & 0x80808080) ^ ((tmps[0]<<7) & 0x80808080); + tmpc[1] = ((tmps[1]>>1) & 0x7f7f7f7f) ^ (tmps[1] & 0x80808080) ^ ((tmps[1]<<7) & 0x80808080); + tmpc[2] = ((tmps[2]>>1) & 0x7f7f7f7f) ^ (tmps[2] & 0x80808080) ^ ((tmps[2]<<7) & 0x80808080); + tmpc[3] = ((tmps[3]>>1) & 0x7f7f7f7f) ^ (tmps[3] & 0x80808080) ^ ((tmps[3]<<7) & 0x80808080); + + // use byte access because of memory alignment. + // c is not always in word(4 byte) alignment. + c[0] = tmpc[0] &0xFF; + c[1] = (tmpc[0]>>8) &0xFF; + c[2] = (tmpc[0]>>16)&0xFF; + c[3] = (tmpc[0]>>24)&0xFF; + c[4] = tmpc[1] &0xFF; + c[5] = (tmpc[1]>>8) &0xFF; + c[6] = (tmpc[1]>>16)&0xFF; + c[7] = (tmpc[1]>>24)&0xFF; + c[8] = tmpc[2] &0xFF; + c[9] = (tmpc[2]>>8) &0xFF; + c[10] = (tmpc[2]>>16)&0xFF; + c[11] = (tmpc[2]>>24)&0xFF; + c[12] = tmpc[3] &0xFF; + c[13] = (tmpc[3]>>8) &0xFF; + c[14] = (tmpc[3]>>16)&0xFF; + c[15] = (tmpc[3]>>24)&0xFF; +} + +void rho_ad_eqov16 (const unsigned char* m, + unsigned char* s) { + *((unsigned int *)&s[0]) ^= *((unsigned int *)&m[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&m[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&m[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&m[12]); +} + +void rho_ad_ud16 (const unsigned char* m, + unsigned char* s, + int len8) { + unsigned char mp [16]; + + pad(m,mp,16,len8); + *((unsigned int *)&s[0]) ^= *((unsigned int *)&mp[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&mp[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&mp[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&mp[12]); +} + +void rho_eqov16 (const unsigned char* m, + unsigned char* c, + unsigned char* s) { + g8A(s,c); + + *((unsigned int *)&s[0]) ^= *((unsigned int *)&m[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&m[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&m[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&m[12]); + + *((unsigned int *)&c[0]) ^= *((unsigned int *)&m[0]); + *((unsigned int *)&c[4]) ^= *((unsigned int *)&m[4]); + *((unsigned int *)&c[8]) ^= *((unsigned int *)&m[8]); + *((unsigned int *)&c[12]) ^= *((unsigned int *)&m[12]); +} + +void rho_ud16 (const unsigned char* m, + unsigned char* c, + unsigned char* s, + int len8, + int ver) { + int i; + unsigned char mp [16]; + + pad(m,mp,ver,len8); + + g8A(s,c); + *((unsigned int *)&s[0]) ^= *((unsigned int *)&mp[0]); + *((unsigned 
int *)&s[4]) ^= *((unsigned int *)&mp[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&mp[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&mp[12]); + for (i = 0; i < ver; i++) { + if (i < len8) { + c[i] = c[i] ^ mp[i]; + } + else { + c[i] = 0; + } + } +} + +void irho (unsigned char* m, + const unsigned char* c, + unsigned char* s, + int len8, + int ver) { + int i; + unsigned char cp [16]; + + pad(c,cp,ver,len8); + + g8A(s,m); + for (i = 0; i < ver; i++) { + if (i < len8) { + s[i] = s[i] ^ cp[i] ^ m[i]; + } + else { + s[i] = s[i] ^ cp[i]; + } + if (i < len8) { + m[i] = m[i] ^ cp[i]; + } + else { + m[i] = 0; + } + } +} + +void reset_lfsr_gf56 (unsigned char* CNT) { + *((unsigned int *)&CNT[0]) = 0x00000001; + *((unsigned int *)&CNT[4]) = 0x00000000; +} + +void lfsr_gf56 (unsigned char* CNT) { + unsigned int tmpCNT[2]; + unsigned int fb0; + + tmpCNT[0] = *((unsigned int *)&CNT[0]); // CNT3 CNT2 CNT1 CNT0 + tmpCNT[1] = *((unsigned int *)&CNT[4]); // CNT7 CNT6 CNT5 CNT4 + + fb0 = 0; + if ((tmpCNT[1] >> 23)&0x01) { + fb0 = 0x95; + } + + tmpCNT[1] = tmpCNT[1] << 1 | tmpCNT[0] >> 31; + tmpCNT[0] = tmpCNT[0] << 1 ^ fb0; + + *((unsigned int *)&CNT[0]) = tmpCNT[0]; + *((unsigned int *)&CNT[4]) = tmpCNT[1]; +} + +void block_cipher(unsigned char* s, + const unsigned char* k, unsigned char* T, + unsigned char* CNT, + skinny_ctrl* p_skinny_ctrl) { + p_skinny_ctrl->func_skinny_128_384_enc (s,p_skinny_ctrl,CNT,T,k); +} + +void nonce_encryption (const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + skinny_ctrl* p_skinny_ctrl) { + unsigned char T [16]; + *((unsigned int *)&T[0]) = *((unsigned int *)&N[0]); + *((unsigned int *)&T[4]) = *((unsigned int *)&N[4]); + *((unsigned int *)&T[8]) = *((unsigned int *)&N[8]); + *((unsigned int *)&T[12]) = *((unsigned int *)&N[12]); + CNT[7] = D; + block_cipher(s,k,T,CNT,p_skinny_ctrl); + +} + +void generate_tag (unsigned char** c, unsigned char* s, + int n, unsigned long long* clen) { + + g8A_for_Tag_Generation(s, *c); + *c = *c + n; + *c = *c - *clen; + +} + +unsigned long long msg_encryption (const unsigned char** M, unsigned char** c, + const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + unsigned long long mlen, + skinny_ctrl* p_skinny_ctrl) { + int len8; + + if (mlen >= 16) { + len8 = 16; + mlen = mlen - 16; + rho_eqov16(*M, *c, s); + } + else { + len8 = mlen; + mlen = 0; + rho_ud16(*M, *c, s, len8, 16); + } + *c = *c + len8; + *M = *M + len8; + lfsr_gf56(CNT); + if (mlen != 0) { + nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl); + } + return mlen; +} + + + +unsigned long long msg_decryption (unsigned char** M, const unsigned char** c, + const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + unsigned long long clen, + skinny_ctrl* p_skinny_ctrl) { + int len8; + + if (clen >= 16) { + len8 = 16; + clen = clen - 16; + } + else { + len8 = clen; + clen = 0; + } + irho(*M, *c, s, len8, 16); + *c = *c + len8; + *M = *M + len8; + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl); + return clen; +} + +unsigned long long ad2msg_encryption (const unsigned char** M, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + unsigned long long mlen, + skinny_ctrl* p_skinny_ctrl) { + unsigned char T [16]; + int len8; + + if (mlen <= 16) { + len8 = mlen; + mlen = 0; + + pad (*M,T,16,len8); + } + else { + len8 = 16; + mlen = mlen - 16; + + unsigned char *pM = (unsigned 
char *)(*M); + *((unsigned int *)&T[0]) = *((unsigned int *)&pM[0]); + *((unsigned int *)&T[4]) = *((unsigned int *)&pM[4]); + *((unsigned int *)&T[8]) = *((unsigned int *)&pM[8]); + *((unsigned int *)&T[12]) = *((unsigned int *)&pM[12]); + } + + CNT[7] = D; + block_cipher(s,k,T,CNT,p_skinny_ctrl); + lfsr_gf56(CNT); + *M = *M + len8; + + return mlen; + +} + + +unsigned long long ad_encryption (const unsigned char** A, unsigned char* s, + const unsigned char* k, unsigned long long adlen, + unsigned char* CNT, + unsigned char D, + skinny_ctrl* p_skinny_ctrl) { + + unsigned char T [16]; + int len8; + + if (adlen >= 16) { + len8 = 16; + adlen = adlen - 16; + rho_ad_eqov16(*A, s); + } + else { + len8 = adlen; + adlen = 0; + rho_ad_ud16(*A, s, len8); + } + *A = *A + len8; + lfsr_gf56(CNT); + if (adlen != 0) { + if (adlen >= 16) { + len8 = 16; + adlen = adlen - 16; + + unsigned char *pA = (unsigned char *)(*A); + *((unsigned int *)&T[0]) = *((unsigned int *)&pA[0]); + *((unsigned int *)&T[4]) = *((unsigned int *)&pA[4]); + *((unsigned int *)&T[8]) = *((unsigned int *)&pA[8]); + *((unsigned int *)&T[12]) = *((unsigned int *)&pA[12]); + } + else { + len8 = adlen; + adlen = 0; + + pad(*A, T, 16, len8); + } + *A = *A + len8; + CNT[7] = D; + block_cipher(s,k,T,CNT,p_skinny_ctrl); + lfsr_gf56(CNT); + } + + return adlen; +} + +int crypto_aead_encrypt ( + unsigned char* c, unsigned long long* clen, + const unsigned char* m, unsigned long long mlen, + const unsigned char* ad, unsigned long long adlen, + const unsigned char* nsec, + const unsigned char* npub, + const unsigned char* k + ) +{ + unsigned char s[16]; + unsigned char CNT[8]; // size 7 -> 8 for word access + unsigned char T[16]; + const unsigned char* N; + unsigned char w; + unsigned long long xlen; + + skinny_ctrl l_skinny_ctrl; + l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12; + + (void)nsec; + N = npub; + + xlen = mlen; + + *((unsigned int *)&s[0]) = 0x00000000; + *((unsigned int *)&s[4]) = 0x00000000; + *((unsigned int *)&s[8]) = 0x00000000; + *((unsigned int *)&s[12]) = 0x00000000; + reset_lfsr_gf56(CNT); + + w = 48; + + if (adlen == 0) { + w = w ^ 2; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else if (adlen%(32) == 0) { + w = w ^ 8; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else if (adlen%(32) < 16) { + w = w ^ 2; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else if (adlen%(32) == 16) { + w = w ^ 0; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else { + w = w ^ 10; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + + if (adlen == 0) { // AD is an empty string + lfsr_gf56(CNT); + } + else while (adlen > 0) { + adlen = ad_encryption(&ad,s,k,adlen,CNT,40,&l_skinny_ctrl); + } + + if ((w & 8) == 0) { + xlen = ad2msg_encryption 
(&m,CNT,s,k,44,xlen,&l_skinny_ctrl); + } + else if (mlen == 0) { + lfsr_gf56(CNT); + } + while (xlen > 0) { + xlen = ad_encryption(&m,s,k,xlen,CNT,44,&l_skinny_ctrl); + } + nonce_encryption(N,CNT,s,k,w,&l_skinny_ctrl); + + // because, nonce_encryption is called at the last block of AD encryption + l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc1_1; + + // Tag generation + g8A(s, T); + + m = m - mlen; + + reset_lfsr_gf56(CNT); + + *((unsigned int *)&s[0]) = *((unsigned int *)&T[0]); + *((unsigned int *)&s[4]) = *((unsigned int *)&T[4]); + *((unsigned int *)&s[8]) = *((unsigned int *)&T[8]); + *((unsigned int *)&s[12]) = *((unsigned int *)&T[12]); + + *clen = mlen + 16; + + if (mlen > 0) { + nonce_encryption(N,CNT,s,k,36,&l_skinny_ctrl); + while (mlen > 16) { + mlen = msg_encryption(&m,&c,N,CNT,s,k,36,mlen,&l_skinny_ctrl); + } + rho_ud16(m, c, s, mlen, 16); + c = c + mlen; + m = m + mlen; + } + + // Tag Concatenation + + // use byte access because of memory alignment. + // c is not always in word(4 byte) alignment. + + for (int i = 0; i < 16; i = i + 1) { + *(c + i) = T[i]; + } + + c = c - *clen; + + return 0; +} + +int crypto_aead_decrypt( +unsigned char *m,unsigned long long *mlen, +unsigned char *nsec, +const unsigned char *c,unsigned long long clen, +const unsigned char *ad,unsigned long long adlen, +const unsigned char *npub, +const unsigned char *k +) +{ + unsigned char s[16]; + unsigned char CNT[8]; // size 7 -> 8 for word access + unsigned char T[16]; + const unsigned char* N; + unsigned char w; + unsigned long long xlen; + const unsigned char* mauth; + + skinny_ctrl l_skinny_ctrl; + l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12; + + (void)nsec; + mauth = m; + + N = npub; + + xlen = clen-16; + + reset_lfsr_gf56(CNT); + + for (int i = 0; i < 16; i++) { + T[i] = *(c + clen - 16 + i); + } + + *((unsigned int *)&s[0]) = *((unsigned int *)&T[0]); + *((unsigned int *)&s[4]) = *((unsigned int *)&T[4]); + *((unsigned int *)&s[8]) = *((unsigned int *)&T[8]); + *((unsigned int *)&s[12]) = *((unsigned int *)&T[12]); + + clen = clen - 16; + *mlen = clen; + + if (clen > 0) { + nonce_encryption(N,CNT,s,k,36,&l_skinny_ctrl); + while (clen > 16) { + clen = msg_decryption(&m,&c,N,CNT,s,k,36,clen,&l_skinny_ctrl); + } + irho(m, c, s, clen, 16); + c = c + clen; + m = m + clen; + } + + *((unsigned int *)&s[0]) = 0x00000000; + *((unsigned int *)&s[4]) = 0x00000000; + *((unsigned int *)&s[8]) = 0x00000000; + *((unsigned int *)&s[12]) = 0x00000000; + reset_lfsr_gf56(CNT); + + w = 48; + + if (adlen == 0) { + w = w ^ 2; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else if (adlen%(32) == 0) { + w = w ^ 8; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else if (adlen%(32) < 16) { + w = w ^ 2; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else if (adlen%(32) == 16) { + w = w ^ 0; + if (xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + else { + w = w ^ 10; + if 
(xlen == 0) { + w =w ^ 1; + } + else if (xlen%(32) == 0) { + w = w ^ 4; + } + else if (xlen%(32) < 16) { + w = w ^ 1; + } + else if (xlen%(32) == 16) { + w = w ^ 0; + } + else { + w = w ^ 5; + } + } + + if (adlen == 0) { // AD is an empty string + lfsr_gf56(CNT); + } + else while (adlen > 0) { + adlen = ad_encryption(&ad,s,k,adlen,CNT,40,&l_skinny_ctrl); + } + + if ((w & 8) == 0) { + xlen = ad2msg_encryption (&mauth,CNT,s,k,44,xlen,&l_skinny_ctrl); + } + else if (clen == 0) { + lfsr_gf56(CNT); + } + while (xlen > 0) { + xlen = ad_encryption(&mauth,s,k,xlen,CNT,44,&l_skinny_ctrl); + } + nonce_encryption(N,CNT,s,k,w,&l_skinny_ctrl); + + // Tag generation + g8A_for_Tag_Generation(s, T); + for (int i = 0; i < 16; i++) { + if (T[i] != (*(c+i))) { + return -1; + } + } + + return 0; +} diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/genkat_aead.c b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/genkat_aead.c new file mode 100644 index 0000000..21f840f --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/genkat_aead.c @@ -0,0 +1,161 @@ +// +// NIST-developed software is provided by NIST as a public service. +// You may use, copy and distribute copies of the software in any medium, +// provided that you keep intact this entire notice. You may improve, +// modify and create derivative works of the software or any portion of +// the software, and you may copy and distribute such modifications or +// works. Modified works should carry a notice stating that you changed +// the software and should note the date and nature of any such change. +// Please explicitly acknowledge the National Institute of Standards and +// Technology as the source of the software. +// +// NIST-developed software is expressly provided "AS IS." NIST MAKES NO +// WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT OR ARISING BY OPERATION +// OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT AND DATA ACCURACY. NIST +// NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE +// UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST +// DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE +// OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, +// RELIABILITY, OR USEFULNESS OF THE SOFTWARE. +// +// You are solely responsible for determining the appropriateness of using and +// distributing the software and you assume all risks associated with its use, +// including but not limited to the risks and costs of program errors, compliance +// with applicable laws, damage to or loss of data, programs or equipment, and +// the unavailability or interruption of operation. This software is not intended +// to be used in any situation where a failure could cause risk of injury or +// damage to property. The software developed by NIST employees is not subject to +// copyright protection within the United States. 
+//
+
+// disable deprecation for sprintf and fopen
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "crypto_aead.h"
+#include "api.h"
+
+#define KAT_SUCCESS 0
+#define KAT_FILE_OPEN_ERROR -1
+#define KAT_DATA_ERROR -3
+#define KAT_CRYPTO_FAILURE -4
+
+#define MAX_FILE_NAME 256
+#define MAX_MESSAGE_LENGTH 32
+#define MAX_ASSOCIATED_DATA_LENGTH 32
+
+void init_buffer(unsigned char *buffer, unsigned long long numbytes);
+
+void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length);
+
+int generate_test_vectors();
+
+int main()
+{
+    int ret = generate_test_vectors();
+
+    if (ret != KAT_SUCCESS) {
+        fprintf(stderr, "test vector generation failed with code %d\n", ret);
+    }
+
+    return ret;
+}
+
+int generate_test_vectors()
+{
+    FILE *fp;
+    char fileName[MAX_FILE_NAME];
+    unsigned char key[CRYPTO_KEYBYTES];
+    unsigned char nonce[CRYPTO_NPUBBYTES];
+    unsigned char msg[MAX_MESSAGE_LENGTH];
+    unsigned char msg2[MAX_MESSAGE_LENGTH];
+    unsigned char ad[MAX_ASSOCIATED_DATA_LENGTH];
+    unsigned char ct[MAX_MESSAGE_LENGTH + CRYPTO_ABYTES];
+    unsigned long long clen, mlen2;
+    int count = 1;
+    int func_ret, ret_val = KAT_SUCCESS;
+
+    init_buffer(key, sizeof(key));
+    init_buffer(nonce, sizeof(nonce));
+    init_buffer(msg, sizeof(msg));
+    init_buffer(ad, sizeof(ad));
+
+    sprintf(fileName, "LWC_AEAD_KAT_%d_%d.txt", (CRYPTO_KEYBYTES * 8), (CRYPTO_NPUBBYTES * 8));
+
+    if ((fp = fopen(fileName, "w")) == NULL) {
+        fprintf(stderr, "Couldn't open <%s> for write\n", fileName);
+        return KAT_FILE_OPEN_ERROR;
+    }
+
+    for (unsigned long long mlen = 0; (mlen <= MAX_MESSAGE_LENGTH) && (ret_val == KAT_SUCCESS); mlen++) {
+        for (unsigned long long adlen = 0; adlen <= MAX_ASSOCIATED_DATA_LENGTH; adlen++) {
+
+            fprintf(fp, "Count = %d\n", count++);
+            printf("Count = %d\n", count - 1);
+
+            fprint_bstr(fp, "Key = ", key, CRYPTO_KEYBYTES);
+
+            fprint_bstr(fp, "Nonce = ", nonce, CRYPTO_NPUBBYTES);
+
+            fprint_bstr(fp, "PT = ", msg, mlen);
+
+            fprint_bstr(fp, "AD = ", ad, adlen);
+
+            if ((func_ret = crypto_aead_encrypt(ct, &clen, msg, mlen, ad, adlen, NULL, nonce, key)) != 0) {
+                fprintf(fp, "crypto_aead_encrypt returned <%d>\n", func_ret);
+                ret_val = KAT_CRYPTO_FAILURE;
+                break;
+            }
+
+            fprint_bstr(fp, "CT = ", ct, clen);
+
+            fprintf(fp, "\n");
+
+            if ((func_ret = crypto_aead_decrypt(msg2, &mlen2, NULL, ct, clen, ad, adlen, nonce, key)) != 0) {
+                fprintf(fp, "crypto_aead_decrypt returned <%d>\n", func_ret);
+                ret_val = KAT_CRYPTO_FAILURE;
+                break;
+            }
+
+            if (mlen != mlen2) {
+                fprintf(fp, "crypto_aead_decrypt returned bad 'mlen': Got <%llu>, expected <%llu>\n", mlen2, mlen);
+                ret_val = KAT_CRYPTO_FAILURE;
+                break;
+            }
+
+            if (memcmp(msg, msg2, mlen)) {
+                fprintf(fp, "crypto_aead_decrypt did not recover the plaintext\n");
+                ret_val = KAT_CRYPTO_FAILURE;
+                break;
+            }
+        }
+    }
+
+    fclose(fp);
+
+    return ret_val;
+}
+
+
+void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length)
+{
+    fprintf(fp, "%s", label);
+
+    for (unsigned long long i = 0; i < length; i++)
+        fprintf(fp, "%02X", data[i]);
+
+    fprintf(fp, "\n");
+}
+
+void init_buffer(unsigned char *buffer, unsigned long long numbytes)
+{
+    for (unsigned long long i = 0; i < numbytes; i++)
+        buffer[i] = (unsigned char)i;
+}
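For reference, with CRYPTO_KEYBYTES = CRYPTO_NPUBBYTES = 16 the driver above writes LWC_AEAD_KAT_128_128.txt, whose first record has the shape sketched below. Key, Nonce, PT and AD follow directly from init_buffer filling each buffer with 0x00, 0x01, ...; the CT line (ciphertext followed by the 16-byte tag) depends on the cipher, so it is shown only as a placeholder:

    Count = 1
    Key = 000102030405060708090A0B0C0D0E0F
    Nonce = 000102030405060708090A0B0C0D0E0F
    PT =
    AD =
    CT = <mlen + 16 bytes in uppercase hex; just the 16-byte tag when PT is empty>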
diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny.h b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny.h
new file mode 100644
index 0000000..6392b0f
--- /dev/null
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny.h
@@ -0,0 +1,8 @@
+typedef struct ___skinny_ctrl {
+    unsigned char roundKeys[960]; // number of rounds : 56
+    void (*func_skinny_128_384_enc)(unsigned char*, struct ___skinny_ctrl*, unsigned char* CNT, unsigned char* T, const unsigned char* K);
+} skinny_ctrl;
+
+extern void skinny_128_384_enc123_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
+extern void skinny_128_384_enc12_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
+extern void skinny_128_384_enc1_1 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K);
diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_key_schedule2.c b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_key_schedule2.c
new file mode 100644
index 0000000..58006f2
--- /dev/null
+++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_key_schedule2.c
@@ -0,0 +1,3027 @@
+/******************************************************************************
+ * Copyright (c) 2020, NEC Corporation.
+ *
+ * THIS CODE IS FURNISHED TO YOU "AS IS" WITHOUT WARRANTY OF ANY KIND.
+ *
+ *****************************************************************************/
+
+/*
+ * SKINNY-128-384
+ *
+ * load AC(c0 c1) ^ TK3
+ * calc AC(c0 c1) ^ TK2 -> store
+ * ART(TK2)
+ *
+ * number of rounds : 56
+ */
+
+__attribute__((aligned(4)))
+void RunEncryptionKeyScheduleTK2(unsigned char *roundKeys)
+{
+    // r0    : points to roundKeys(& masterKey)
+    // r1-r4 : key state
+    // r5-r6 : temp use
+    // r7    : constant(0xfefefefe)
+    // r8    : constant(0x01010101)
+    // r9    : temp use
+    // r10   : temp use
+    asm volatile(
+        "stmdb sp!, {r4-r10} \n\t"
+        "ldr.w r1, [r0,#16] \n\t" // load master key
+        "ldr.w r2, [r0,#20] \n\t" // load master key
+        "ldr.w r3, [r0,#24] \n\t" // load master key
+        "ldr.w r4, [r0,#28] \n\t" // load master key
+        "mov.w r7, #0xfefefefe \n\t"
+        "mov.w r8, #0x01010101 \n\t"
+
+        // round 1
+
+        "ldr.w r9, [r0,#512] \n\t" // load TK3 ^ AC(c0 c1)
+        "ldr.w r10, [r0,#516] \n\t" // load TK3 ^ AC(c0 c1)
+
+        "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1)
+        "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1)
+
+        // round key store((TK2 ^ TK3 ^ AC(c0 c1))
+        "str.w r9, [r0,#64] \n\t"
+        "str.w r10, [r0,#68] \n\t"
+
+        // permutation
+        // r1 (k3 k2 k1 k0)    k13 k8 k15 k9
+        // r2 (k7 k6 k5 k4)    k11 k12 k14 k10
+        // r3 (k11 k10 k9 k8)  ------> k3 k2 k1 k0
+        // r4 (k15 k14 k13 k12)        k7 k6 k5 k4
+        "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 )
+        "mov r6, r4 \n\t" // r6(k15 k14 k13 k12)
+        "mov r3, r1 \n\t" // r3(k3 k2 k1 k0)
+        "mov r4, r2 \n\t" // r4(k7 k6 k5 k4)
+#ifdef STM32F4 // for Cortex-M4
+        "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14)
+        "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14)
+        "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10)
+        "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8)
+        "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9)
+        "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10)
+        "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14)
+        "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10)
+#else // for Cortex-M3
+        "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15)
+        "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --)
+        "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --)
+        "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9)
+        "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9)
+        "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10)
+        "bfi.w r2,r6, #16,#8 \n\t" // r2(k11
k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 2 + + "ldr.w r9, [r0,#520] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#524] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#72] \n\t" + "str.w r10, [r0,#76] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 3 + + "ldr.w r9, [r0,#528] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#532] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#80] \n\t" + "str.w r10, [r0,#84] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + 
"rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 4 + + "ldr.w r9, [r0,#536] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#540] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#88] \n\t" + "str.w r10, [r0,#92] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 5 + + "ldr.w r9, [r0,#544] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#548] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#96] \n\t" + "str.w r10, [r0,#100] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + 
"pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 6 + + "ldr.w r9, [r0,#552] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#556] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#104] \n\t" + "str.w r10, [r0,#108] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 7 + + "ldr.w r9, [r0,#560] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#564] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#112] \n\t" + "str.w r10, [r0,#116] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 
k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 8 + + "ldr.w r9, [r0,#568] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#572] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#120] \n\t" + "str.w r10, [r0,#124] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 9 + + "ldr.w r9, [r0,#576] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#580] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 
^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#128] \n\t" + "str.w r10, [r0,#132] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 10 + + "ldr.w r9, [r0,#584] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#588] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#136] \n\t" + "str.w r10, [r0,#140] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + 
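+ /* Editor's note (not in the original source): the five instructions around
+  * this point are the byte-sliced TK2 LFSR. With r7 = 0xfefefefe and
+  * r8 = 0x01010101 they compute, for each of the four bytes x in the word,
+  *
+  *     x = ((x << 1) & 0xfe) | (((x >> 7) ^ (x >> 5)) & 0x01);
+  *
+  * i.e. the SKINNY TK2 update (x7 ... x0) -> (x6 ... x0, x7^x5). The
+  * recurring "x7^x6" in the LFSR comments should read "x7^x5"; the code
+  * itself matches the SKINNY specification (the "lsr #5" selects bit 5). */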
"eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 11 + + "ldr.w r9, [r0,#592] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#596] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#144] \n\t" + "str.w r10, [r0,#148] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 12 + + "ldr.w r9, [r0,#600] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#604] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#152] \n\t" + "str.w r10, [r0,#156] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 
k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 13 + + "ldr.w r9, [r0,#608] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#612] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#160] \n\t" + "str.w r10, [r0,#164] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 14 + + "ldr.w r9, [r0,#616] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#620] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#168] \n\t" + "str.w r10, [r0,#172] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + 
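+ /* Editor's note (not in the original source): the #else path below serves
+  * Cortex-M3, which implements ARMv7-M without the DSP extension, so the
+  * PKHTB instruction used in the Cortex-M4 branch is unavailable. The same
+  * two permuted words are rebuilt with REV (reverse the four bytes of a
+  * word), REV16 (swap the bytes within each halfword), LSR and BFI, and
+  * both branches leave r1/r2 in the identical layout for the LFSR step. */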
"rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 15 + + "ldr.w r9, [r0,#624] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#628] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#176] \n\t" + "str.w r10, [r0,#180] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 16 + + "ldr.w r9, [r0,#632] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#636] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#184] \n\t" + "str.w r10, [r0,#188] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + 
"pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 17 + + "ldr.w r9, [r0,#640] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#644] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#192] \n\t" + "str.w r10, [r0,#196] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 18 + + "ldr.w r9, [r0,#648] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#652] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#200] \n\t" + "str.w r10, [r0,#204] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 
k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 19 + + "ldr.w r9, [r0,#656] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#660] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#208] \n\t" + "str.w r10, [r0,#212] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 20 + + "ldr.w r9, [r0,#664] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#668] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // 
TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#216] \n\t" + "str.w r10, [r0,#220] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 21 + + "ldr.w r9, [r0,#672] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#676] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#224] \n\t" + "str.w r10, [r0,#228] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 
\n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 22 + + "ldr.w r9, [r0,#680] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#684] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#232] \n\t" + "str.w r10, [r0,#236] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 23 + + "ldr.w r9, [r0,#688] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#692] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#240] \n\t" + "str.w r10, [r0,#244] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" 
// r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 24 + + "ldr.w r9, [r0,#696] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#700] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#248] \n\t" + "str.w r10, [r0,#252] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 25 + + "ldr.w r9, [r0,#704] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#708] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#256] \n\t" + "str.w r10, [r0,#260] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for 
Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 26 + + "ldr.w r9, [r0,#712] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#716] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#264] \n\t" + "str.w r10, [r0,#268] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 27 + + "ldr.w r9, [r0,#720] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#724] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#272] \n\t" + "str.w r10, [r0,#276] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 
k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 28 + + "ldr.w r9, [r0,#728] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#732] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#280] \n\t" + "str.w r10, [r0,#284] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 29 + + "ldr.w r9, [r0,#736] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#740] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#288] \n\t" + "str.w r10, [r0,#292] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + 
// r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 30 + + "ldr.w r9, [r0,#744] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#748] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#296] \n\t" + "str.w r10, [r0,#300] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 31 + + "ldr.w r9, [r0,#752] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#756] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 
\n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#304] \n\t" + "str.w r10, [r0,#308] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 32 + + "ldr.w r9, [r0,#760] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#764] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#312] \n\t" + "str.w r10, [r0,#316] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w 
r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 33 + + "ldr.w r9, [r0,#768] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#772] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#320] \n\t" + "str.w r10, [r0,#324] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 34 + + "ldr.w r9, [r0,#776] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#780] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#328] \n\t" + "str.w r10, [r0,#332] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 
\n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 35 + + "ldr.w r9, [r0,#784] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#788] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#336] \n\t" + "str.w r10, [r0,#340] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 36 + + "ldr.w r9, [r0,#792] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#796] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#344] \n\t" + "str.w r10, [r0,#348] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // 
for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 37 + + "ldr.w r9, [r0,#800] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#804] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#352] \n\t" + "str.w r10, [r0,#356] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 38 + + "ldr.w r9, [r0,#808] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#812] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#360] \n\t" + "str.w r10, [r0,#364] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // 
r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 39 + + "ldr.w r9, [r0,#816] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#820] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#368] \n\t" + "str.w r10, [r0,#372] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 40 + + "ldr.w r9, [r0,#824] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#828] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#376] \n\t" + "str.w r10, [r0,#380] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 
k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 41 + + "ldr.w r9, [r0,#832] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#836] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#384] \n\t" + "str.w r10, [r0,#388] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 42 + + "ldr.w r9, [r0,#840] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#844] \n\t" // load TK3 ^ AC(c0 c1) + + 
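+ // Note: every round of this unrolled schedule follows the same addressing
+ // pattern: load the precomputed TK3 ^ AC(c0 c1) pair at
+ // [r0, #512 + 8*(round-1)] and store the combined TK2 ^ TK3 ^ AC(c0 c1)
+ // round key at [r0, #64 + 8*(round-1)]; round 42 uses #840/#844 and
+ // #392/#396 accordingly.
+ //
+ // The "LFSR(for TK2)" and/eor groups update all four bytes of a word in
+ // parallel; the lsr #7 / lsr #5 terms make the per-byte feedback bit
+ // x7 ^ x5, i.e. (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x5).
+ // A C sketch of the same word-wise update (hypothetical helper; assumes
+ // r7/r8 were loaded with the byte-lane masks 0xfefefefe/0x01010101 earlier
+ // in this function):
+ //
+ //   static inline uint32_t tk2_lfsr_word(uint32_t w) {
+ //       return ((w << 1) & 0xfefefefeu)                // x6..x0 into bits 7..1
+ //            ^ (((w >> 7) ^ (w >> 5)) & 0x01010101u);  // x7^x5 into bit 0
+ //   }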
"eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#392] \n\t" + "str.w r10, [r0,#396] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 43 + + "ldr.w r9, [r0,#848] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#852] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#400] \n\t" + "str.w r10, [r0,#404] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 
\n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 44 + + "ldr.w r9, [r0,#856] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#860] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#408] \n\t" + "str.w r10, [r0,#412] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 45 + + "ldr.w r9, [r0,#864] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#868] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#416] \n\t" + "str.w r10, [r0,#420] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w 
r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 46 + + "ldr.w r9, [r0,#872] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#876] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#424] \n\t" + "str.w r10, [r0,#428] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 47 + + "ldr.w r9, [r0,#880] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#884] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#432] \n\t" + "str.w r10, [r0,#436] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 
k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 48 + + "ldr.w r9, [r0,#888] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#892] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#440] \n\t" + "str.w r10, [r0,#444] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 49 + + "ldr.w r9, [r0,#896] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#900] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#448] \n\t" + "str.w r10, [r0,#452] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 
\n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 50 + + "ldr.w r9, [r0,#904] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#908] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#456] \n\t" + "str.w r10, [r0,#460] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 51 + + "ldr.w r9, [r0,#912] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#916] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#464] \n\t" + "str.w r10, [r0,#468] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) 
------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 52 + + "ldr.w r9, [r0,#920] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#924] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#472] \n\t" + "str.w r10, [r0,#476] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 53 + + "ldr.w r9, [r0,#928] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#932] \n\t" // load TK3 ^ AC(c0 
c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#480] \n\t" + "str.w r10, [r0,#484] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 54 + + "ldr.w r9, [r0,#936] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#940] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#488] \n\t" + "str.w r10, [r0,#492] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, 
lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 55 + + "ldr.w r9, [r0,#944] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#948] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#496] \n\t" + "str.w r10, [r0,#500] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 56 + + "ldr.w r9, [r0,#952] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#956] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#504] \n\t" + "str.w r10, [r0,#508] \n\t" + + // permutation + + // not need to calculate (not used) + + "ldmia.w sp!, {r4-r10} \n\t" + : + : [roundKeys] "r" (roundKeys) + : "cc"); +} diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_key_schedule3.c b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_key_schedule3.c new file mode 100644 index 0000000..81c4406 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_key_schedule3.c @@ -0,0 +1,2898 @@ +/****************************************************************************** + * Copyright (c) 2020, NEC Corporation. + * + * THIS CODE IS FURNISHED TO YOU "AS IS" WITHOUT WARRANTY OF ANY KIND. 
+ * + *****************************************************************************/ + +/* + * SKINNY-128-384 + * + * AC(c0 c1) ^ TK3 -> store + * ART(TK3) + * + * number of rounds : 56 + */ + +__attribute__((aligned(4))) +void RunEncryptionKeyScheduleTK3(unsigned char *roundKeys, unsigned char *pRC) +{ + // r0 : points to roundKeys(& masterKey) + // r1 : points to RC + // r2-r5 : key state + // r6-r7 : temp use + // r8 : constant(0x7f7f7f7f) + // r9 : constant(0x80808080) + asm volatile( + "stmdb sp!, {r4-r9} \n\t" + "ldr.w r2, [r0,#32] \n\t" // load master key + "ldr.w r3, [r0,#36] \n\t" // load master key + "ldr.w r4, [r0,#40] \n\t" // load master key + "ldr.w r5, [r0,#44] \n\t" // load master key + "mov.w r8, #0x7f7f7f7f \n\t" + "mov.w r9, #0x80808080 \n\t" + + // round 1 + + // AC(c0 c1) + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#512] \n\t" + "str.w r3, [r0,#516] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 2 + + // AC(c0 c1) + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#520] \n\t" + "str.w r3, [r0,#524] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // 
r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 3 + + // AC(c0 c1) + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#528] \n\t" + "str.w r3, [r0,#532] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 4 + + // AC(c0 c1) + "eor.w r6, r2, #0xf \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#536] \n\t" + "str.w r3, [r0,#540] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w 
r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 5 + + // AC(c0 c1) + "eor.w r6, r2, #0xf \n\t" // k0^rc + "eor.w r7, r3, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#544] \n\t" + "str.w r7, [r0,#548] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 6 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#552] \n\t" + "str.w r7, [r0,#556] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, 
r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 7 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xd \n\t" // k0^rc + + + // round key store + "str.w r6, [r0,#560] \n\t" + "str.w r7, [r0,#564] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 8 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#568] \n\t" + "str.w r7, [r0,#572] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, 
r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 9 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#576] \n\t" + "str.w r7, [r0,#580] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 10 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xf \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#584] \n\t" + "str.w r7, [r0,#588] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 
k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 11 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#592] \n\t" + "str.w r7, [r0,#596] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 12 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#600] \n\t" + "str.w r7, [r0,#604] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 
k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 13 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#608] \n\t" + "str.w r7, [r0,#612] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 14 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#616] \n\t" + "str.w r7, [r0,#620] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 
k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 15 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#624] \n\t" + "str.w r7, [r0,#628] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 16 + + // AC(c0 c1) + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#632] \n\t" + "str.w r3, [r0,#636] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 
k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 17 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xd \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#640] \n\t" + "str.w r7, [r0,#644] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 18 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xa \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#648] \n\t" + "str.w r7, [r0,#652] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 
k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 19 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x5 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#656] \n\t" + "str.w r7, [r0,#660] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 20 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#664] \n\t" + "str.w r7, [r0,#668] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 
k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 21 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#672] \n\t" + "str.w r7, [r0,#676] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 22 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#680] \n\t" + "str.w r7, [r0,#684] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 
k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 23 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x8 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#688] \n\t" + "str.w r7, [r0,#692] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 24 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x0 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#696] \n\t" + "str.w r7, [r0,#700] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + 
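// ------------------------------------------------------------------ + // Note (added): the mov/ror/bfi (Cortex-M4) and rev/lsl/bfi (Cortex-M3) + // sequences in each round implement the SKINNY tweakey cell permutation + // PT traced in the comment block above. A minimal C sketch, assuming a + // hypothetical byte array tk[16] with tk[i] = cell k_i (illustration + // only, not this file's API): + // static const unsigned char PT[16] = + // { 9, 15, 8, 13, 10, 14, 12, 11, 0, 1, 2, 3, 4, 5, 6, 7 }; + // unsigned char tmp[16]; + // for (int i = 0; i < 16; i++) tmp[i] = tk[PT[i]]; + // memcpy(tk, tmp, 16); /* needs <string.h> */ + // ------------------------------------------------------------------ + 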
"pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 25 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#704] \n\t" + "str.w r7, [r0,#708] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 26 + + // AC(c0 c1) + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#712] \n\t" + "str.w r3, [r0,#716] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 
k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 27 + + // AC(c0 c1) + "eor.w r6, r2, #0x5 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#720] \n\t" + "str.w r3, [r0,#724] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 28 + + // AC(c0 c1) + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#728] \n\t" + "str.w r3, [r0,#732] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, 
r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 29 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#736] \n\t" + "str.w r7, [r0,#740] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 30 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#744] \n\t" + "str.w r7, [r0,#748] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 
\n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 31 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#752] \n\t" + "str.w r7, [r0,#756] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 32 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x8 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#760] \n\t" + "str.w r7, [r0,#764] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" 
// r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 33 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#768] \n\t" + "str.w r7, [r0,#772] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 34 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#776] \n\t" + "str.w r7, [r0,#780] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef 
STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 35 + + // AC(c0 c1) + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#784] \n\t" + "str.w r3, [r0,#788] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 36 + + // AC(c0 c1) + "eor.w r6, r2, #0xd \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#792] \n\t" + "str.w r3, [r0,#796] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + 
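// ------------------------------------------------------------------ + // Note (added): after each permutation, the and/eor block applies the + // TK3 byte LFSR (x7..x0) -> (x0^x6, x7, x6, x5, x4, x3, x2, x1) to four + // bytes at once via the 0x7f7f7f7f / 0x80808080 masks in r8/r9. A + // minimal C sketch of the same masking trick (illustration only, + // hypothetical helper name): + // static uint32_t tk3_lfsr(uint32_t x) { + // uint32_t lo = (x >> 1) & 0x7f7f7f7f; /* x7..x1 -> bits 6..0 */ + // uint32_t b0 = (x << 7) & 0x80808080; /* x0 -> bit 7 */ + // uint32_t b6 = (x << 1) & 0x80808080; /* x6 -> bit 7 */ + // return lo ^ b0 ^ b6; /* bit 7 = x0 ^ x6 */ + // } + // ------------------------------------------------------------------ + 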
"ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 37 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#800] \n\t" + "str.w r7, [r0,#804] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 38 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#808] \n\t" + "str.w r7, [r0,#812] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 
k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 39 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xd \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#816] \n\t" + "str.w r7, [r0,#820] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 40 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xa \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#824] \n\t" + "str.w r7, [r0,#828] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // 
r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 41 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x4 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#832] \n\t" + "str.w r7, [r0,#836] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 42 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#840] \n\t" + "str.w r7, [r0,#844] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" 
// r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 43 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#848] \n\t" + "str.w r7, [r0,#852] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 44 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x4 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#856] \n\t" + "str.w r7, [r0,#860] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 
\n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 45 + + // AC(c0 c1) + "eor.w r6, r2, #0x8 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#864] \n\t" + "str.w r3, [r0,#868] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 46 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#872] \n\t" + "str.w r7, [r0,#876] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + 
"mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 47 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#880] \n\t" + "str.w r7, [r0,#884] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 48 + + // AC(c0 c1) + "eor.w r6, r2, #0x4 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#888] \n\t" + "str.w r3, [r0,#892] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 
k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 49 + + // AC(c0 c1) + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#896] \n\t" + "str.w r3, [r0,#900] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 50 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#904] \n\t" + "str.w r7, [r0,#908] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 
k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 51 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#912] \n\t" + "str.w r7, [r0,#916] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 52 + + // AC(c0 c1) + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#920] \n\t" + "str.w r3, [r0,#924] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 
k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 53 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#928] \n\t" + "str.w r7, [r0,#932] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 54 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#936] \n\t" + "str.w r7, [r0,#940] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 
(k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k9) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 55 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x5 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#944] \n\t" + "str.w r7, [r0,#948] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k9) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 56 + + // AC(c0 c1) + "eor.w r6, r2, #0xa \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#952] \n\t" + "str.w r3, [r0,#956] \n\t" + + // permutation + + // no need to calculate (not used) + + "ldmia.w sp!, {r4-r9} \n\t" + : 
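+        // no output operands: the expanded round keys are written back to
+        // the roundKeys buffer by the str.w instructions above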
+ : [roundKeys] "r" (roundKeys), [pRC] "r" (pRC) + : "cc"); +} diff --git a/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_main.c b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_main.c new file mode 100644 index 0000000..217b8a6 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusm1v12/armsrc/skinny_main.c @@ -0,0 +1,4687 @@ +/****************************************************************************** + * Copyright (c) 2020, NEC Corporation. + * + * THIS CODE IS FURNISHED TO YOU "AS IS" WITHOUT WARRANTY OF ANY KIND. + * + *****************************************************************************/ + +/* + * SKINNY-128-384 + * + * ART(TK1) -> store + * load AC(c0 c1) ^ TK3 ^ TK2 + * load TK1 + * calc AC(c0 c1) ^ TK3 ^ TK2 ^ TK1 -> use at (AC->ART) + * SC->SR->(AC->ART)->MC + * + * number of rounds : 56 + */ + +#include "skinny.h" + +/* + * S-BOX + */ +unsigned char SBOX[512] += { + // Original + 0x65, 0x4c, 0x6a, 0x42, 0x4b, 0x63, 0x43, 0x6b, 0x55, 0x75, 0x5a, 0x7a, 0x53, 0x73, 0x5b, 0x7b, + 0x35, 0x8c, 0x3a, 0x81, 0x89, 0x33, 0x80, 0x3b, 0x95, 0x25, 0x98, 0x2a, 0x90, 0x23, 0x99, 0x2b, + 0xe5, 0xcc, 0xe8, 0xc1, 0xc9, 0xe0, 0xc0, 0xe9, 0xd5, 0xf5, 0xd8, 0xf8, 0xd0, 0xf0, 0xd9, 0xf9, + 0xa5, 0x1c, 0xa8, 0x12, 0x1b, 0xa0, 0x13, 0xa9, 0x05, 0xb5, 0x0a, 0xb8, 0x03, 0xb0, 0x0b, 0xb9, + 0x32, 0x88, 0x3c, 0x85, 0x8d, 0x34, 0x84, 0x3d, 0x91, 0x22, 0x9c, 0x2c, 0x94, 0x24, 0x9d, 0x2d, + 0x62, 0x4a, 0x6c, 0x45, 0x4d, 0x64, 0x44, 0x6d, 0x52, 0x72, 0x5c, 0x7c, 0x54, 0x74, 0x5d, 0x7d, + 0xa1, 0x1a, 0xac, 0x15, 0x1d, 0xa4, 0x14, 0xad, 0x02, 0xb1, 0x0c, 0xbc, 0x04, 0xb4, 0x0d, 0xbd, + 0xe1, 0xc8, 0xec, 0xc5, 0xcd, 0xe4, 0xc4, 0xed, 0xd1, 0xf1, 0xdc, 0xfc, 0xd4, 0xf4, 0xdd, 0xfd, + 0x36, 0x8e, 0x38, 0x82, 0x8b, 0x30, 0x83, 0x39, 0x96, 0x26, 0x9a, 0x28, 0x93, 0x20, 0x9b, 0x29, + 0x66, 0x4e, 0x68, 0x41, 0x49, 0x60, 0x40, 0x69, 0x56, 0x76, 0x58, 0x78, 0x50, 0x70, 0x59, 0x79, + 0xa6, 0x1e, 0xaa, 0x11, 0x19, 0xa3, 0x10, 0xab, 0x06, 0xb6, 0x08, 0xba, 0x00, 0xb3, 0x09, 0xbb, + 0xe6, 0xce, 0xea, 0xc2, 0xcb, 0xe3, 0xc3, 0xeb, 0xd6, 0xf6, 0xda, 0xfa, 0xd3, 0xf3, 0xdb, 0xfb, + 0x31, 0x8a, 0x3e, 0x86, 0x8f, 0x37, 0x87, 0x3f, 0x92, 0x21, 0x9e, 0x2e, 0x97, 0x27, 0x9f, 0x2f, + 0x61, 0x48, 0x6e, 0x46, 0x4f, 0x67, 0x47, 0x6f, 0x51, 0x71, 0x5e, 0x7e, 0x57, 0x77, 0x5f, 0x7f, + 0xa2, 0x18, 0xae, 0x16, 0x1f, 0xa7, 0x17, 0xaf, 0x01, 0xb2, 0x0e, 0xbe, 0x07, 0xb7, 0x0f, 0xbf, + 0xe2, 0xca, 0xee, 0xc6, 0xcf, 0xe7, 0xc7, 0xef, 0xd2, 0xf2, 0xde, 0xfe, 0xd7, 0xf7, 0xdf, 0xff, + + // Original ^ c2(0x02) + 0x67, 0x4e, 0x68, 0x40, 0x49, 0x61, 0x41, 0x69, 0x57, 0x77, 0x58, 0x78, 0x51, 0x71, 0x59, 0x79, + 0x37, 0x8e, 0x38, 0x83, 0x8b, 0x31, 0x82, 0x39, 0x97, 0x27, 0x9a, 0x28, 0x92, 0x21, 0x9b, 0x29, + 0xe7, 0xce, 0xea, 0xc3, 0xcb, 0xe2, 0xc2, 0xeb, 0xd7, 0xf7, 0xda, 0xfa, 0xd2, 0xf2, 0xdb, 0xfb, + 0xa7, 0x1e, 0xaa, 0x10, 0x19, 0xa2, 0x11, 0xab, 0x07, 0xb7, 0x08, 0xba, 0x01, 0xb2, 0x09, 0xbb, + 0x30, 0x8a, 0x3e, 0x87, 0x8f, 0x36, 0x86, 0x3f, 0x93, 0x20, 0x9e, 0x2e, 0x96, 0x26, 0x9f, 0x2f, + 0x60, 0x48, 0x6e, 0x47, 0x4f, 0x66, 0x46, 0x6f, 0x50, 0x70, 0x5e, 0x7e, 0x56, 0x76, 0x5f, 0x7f, + 0xa3, 0x18, 0xae, 0x17, 0x1f, 0xa6, 0x16, 0xaf, 0x00, 0xb3, 0x0e, 0xbe, 0x06, 0xb6, 0x0f, 0xbf, + 0xe3, 0xca, 0xee, 0xc7, 0xcf, 0xe6, 0xc6, 0xef, 0xd3, 0xf3, 0xde, 0xfe, 0xd6, 0xf6, 0xdf, 0xff, + 0x34, 0x8c, 0x3a, 0x80, 0x89, 0x32, 0x81, 0x3b, 0x94, 0x24, 0x98, 0x2a, 0x91, 0x22, 0x99, 0x2b, + 0x64, 0x4c, 0x6a, 0x43, 0x4b, 0x62, 0x42, 0x6b, 0x54, 0x74, 0x5a, 0x7a, 0x52, 0x72, 0x5b, 0x7b, + 0xa4, 0x1c, 0xa8, 
0x13, 0x1b, 0xa1, 0x12, 0xa9, 0x04, 0xb4, 0x0a, 0xb8, 0x02, 0xb1, 0x0b, 0xb9, + 0xe4, 0xcc, 0xe8, 0xc0, 0xc9, 0xe1, 0xc1, 0xe9, 0xd4, 0xf4, 0xd8, 0xf8, 0xd1, 0xf1, 0xd9, 0xf9, + 0x33, 0x88, 0x3c, 0x84, 0x8d, 0x35, 0x85, 0x3d, 0x90, 0x23, 0x9c, 0x2c, 0x95, 0x25, 0x9d, 0x2d, + 0x63, 0x4a, 0x6c, 0x44, 0x4d, 0x65, 0x45, 0x6d, 0x53, 0x73, 0x5c, 0x7c, 0x55, 0x75, 0x5d, 0x7d, + 0xa0, 0x1a, 0xac, 0x14, 0x1d, 0xa5, 0x15, 0xad, 0x03, 0xb0, 0x0c, 0xbc, 0x05, 0xb5, 0x0d, 0xbd, + 0xe0, 0xc8, 0xec, 0xc4, 0xcd, 0xe5, 0xc5, 0xed, 0xd0, 0xf0, 0xdc, 0xfc, 0xd5, 0xf5, 0xdd, 0xfd, +}; + +/* + * Round Constants + */ +unsigned char RC[56] += { + 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3E, 0x3D, 0x3B, + 0x37, 0x2F, 0x1E, 0x3C, 0x39, 0x33, 0x27, 0x0E, + 0x1D, 0x3A, 0x35, 0x2B, 0x16, 0x2C, 0x18, 0x30, + 0x21, 0x02, 0x05, 0x0B, 0x17, 0x2E, 0x1C, 0x38, + 0x31, 0x23, 0x06, 0x0D, 0x1B, 0x36, 0x2D, 0x1A, + 0x34, 0x29, 0x12, 0x24, 0x08, 0x11, 0x22, 0x04, + 0x09, 0x13, 0x26, 0x0c, 0x19, 0x32, 0x25, 0x0a,}; + +extern void Encrypt(unsigned char *block, unsigned char *roundKeys, unsigned char *pSBOX) __attribute__((noinline)); +extern void RunEncryptionKeyScheduleTK2(unsigned char *roundKeys) __attribute__((noinline)); +extern void RunEncryptionKeyScheduleTK3(unsigned char *roundKeys, unsigned char *pRC) __attribute__((noinline)); + +void skinny_128_384_enc123_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K) +{ + *((unsigned int *)&pskinny_ctrl->roundKeys[0] ) = *((unsigned int *)&CNT[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[4] ) = *((unsigned int *)&CNT[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[16]) = *((unsigned int *)&T[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[20]) = *((unsigned int *)&T[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[24]) = *((unsigned int *)&T[8]); + *((unsigned int *)&pskinny_ctrl->roundKeys[28]) = *((unsigned int *)&T[12]); + *((unsigned int *)&pskinny_ctrl->roundKeys[32]) = *((unsigned int *)&K[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[36]) = *((unsigned int *)&K[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[40]) = *((unsigned int *)&K[8]); + *((unsigned int *)&pskinny_ctrl->roundKeys[44]) = *((unsigned int *)&K[12]); + + RunEncryptionKeyScheduleTK3(pskinny_ctrl->roundKeys, RC); + RunEncryptionKeyScheduleTK2(pskinny_ctrl->roundKeys); + Encrypt(input, pskinny_ctrl->roundKeys, SBOX); + + pskinny_ctrl->func_skinny_128_384_enc = skinny_128_384_enc12_12; +} + +void skinny_128_384_enc12_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K) +{ + (void)K; + + *((unsigned int *)&pskinny_ctrl->roundKeys[0] ) = *((unsigned int *)&CNT[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[4] ) = *((unsigned int *)&CNT[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[16]) = *((unsigned int *)&T[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[20]) = *((unsigned int *)&T[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[24]) = *((unsigned int *)&T[8]); + *((unsigned int *)&pskinny_ctrl->roundKeys[28]) = *((unsigned int *)&T[12]); + + RunEncryptionKeyScheduleTK2(pskinny_ctrl->roundKeys); + Encrypt(input, pskinny_ctrl->roundKeys, SBOX); +} + +extern void skinny_128_384_enc1_1 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K) +{ + (void)T; + (void)K; + + *((unsigned int *)&pskinny_ctrl->roundKeys[0] ) = *((unsigned int *)&CNT[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[4] ) = *((unsigned int 
*)&CNT[4]); + + Encrypt(input, pskinny_ctrl->roundKeys, SBOX); +} + +__attribute__((aligned(4))) +void Encrypt(unsigned char *block, unsigned char *roundKeys, unsigned char *pSBOX) +{ + // r0 : points to plaintext + // r1 : points to roundKeys(& masterKey) + // r2 : points to SBOX + // r3-r6 : cipher state + // r7-r12: temp use + // r14 : temp use + asm volatile( + "stmdb sp!, {r4-r12,r14} \n\t" + "stmdb.w sp!, {r0} \n\t" // push store pointer + +// ART(TK1) + + "ldm.w r1, {r3-r4} \n\t" // load master key + + // round 1-2 + +// // round key store (not needed) +// "str.w r3, [r1,#0] \n\t" +// "str.w r4, [r1,#4] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k0 k7 k1) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 \n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(--- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 3-4 + + // round key store + "str.w r5, [r1,#8] \n\t" + "str.w r6, [r1,#12] \n\t" + + // permutation + + // r3 (--- --- --- ---) k5 k0 k7 k1 + // r4 (--- --- --- ---) k3 k4 k6 k2 + // r5 ( k3 k2 k1 k0) -----> --- --- --- --- + // r6 ( k7 k6 k5 k4) --- --- --- --- +#ifdef STM32F4 // for Cortex-M4 + "ror.w r3,r6, #16 \n\t" // r3( k5 k4 k7 k6) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 k6) + "pkhtb.w r4,r5, r5, asr #16 \n\t" // r4( k3 k2 k3 k2) + "ror.w r5, #8 \n\t" // r5( k0 k3 k2 k1) + "bfi.w r3,r5, #0,#8 \n\t" // r3( k5 k0 k7 k1) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k3 k2) + "ror.w r6,#16 \n\t" // r6( k5 k4 k7 k6) + "bfi.w r4,r6, #8,#8 \n\t" // r4( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r3, r6 \n\t" // r3( k4 k5 k6 k7) + "lsl.w r3, r3, #8 \n\t" // r3( k5 k6 k7 ---) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 ---) + "lsr.w r5, r5, #8 \n\t" // r5(--- k3 k2 k1) + "bfi.w r3,r5, #0, #8 \n\t" // r3( k5 k0 k7 k1) + "rev16.w r4, r5 \n\t" // r4( k3 --- k1 k2) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k1 k2) + "lsr.w r6, r6, #16 \n\t" // r6(--- --- k7 k6) + "bfi.w r4,r6, #8, #8 \n\t" // r4( k3 k4 k6 k2) +#endif + + // round 5-6 + + // round key store + "str.w r3, [r1,#16] \n\t" + "str.w r4, [r1,#20] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k0 k7 k1) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 
\n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(-- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 7-8 + + // round key store + "str.w r5, [r1,#24] \n\t" + "str.w r6, [r1,#28] \n\t" + + // premutation + + // r3 (--- --- --- ---) k5 k0 k7 k1 + // r4 (--- --- --- ---) k3 k4 k6 k2 + // r5 ( k3 k2 k1 k0) -----> --- --- --- --- + // r6 ( k7 k6 k5 k4) --- --- --- --- +#ifdef STM32F4 // for Cortex-M4 + "ror.w r3,r6, #16 \n\t" // r3( k5 k4 k7 k6) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 k6) + "pkhtb.w r4,r5, r5, asr #16 \n\t" // r4( k3 k2 k3 k2) + "ror.w r5, #8 \n\t" // r5( k0 k3 k2 k1) + "bfi.w r3,r5, #0,#8 \n\t" // r3( k5 k4 k2 k6) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k3 k2) + "ror.w r6,#16 \n\t" // r6( k5 k4 k7 k6) + "bfi.w r4,r6, #8,#8 \n\t" // r4( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r3, r6 \n\t" // r3( k4 k5 k6 k7) + "lsl.w r3, r3, #8 \n\t" // r3( k5 k6 k7 ---) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 ---) + "lsr.w r5, r5, #8 \n\t" // r5(--- k3 k2 k1) + "bfi.w r3,r5, #0, #8 \n\t" // r3( k5 k0 k7 k1) + "rev16.w r4, r5 \n\t" // r4( k3 --- k1 k2) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k1 k2) + "lsr.w r6, r6, #16 \n\t" // r6(-- --- k7 k6) + "bfi.w r4,r6, #8, #8 \n\t" // r4( k3 k4 k6 k2) +#endif + + // round 9-10 + + // round key store + "str.w r3, [r1,#32] \n\t" + "str.w r4, [r1,#36] \n\t" + + // premutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k4 k2 k6) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 \n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(-- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 11-12 + + // round key store + "str.w r5, [r1,#40] \n\t" + "str.w r6, [r1,#44] \n\t" + + // premutation + + // r3 (--- --- --- ---) k5 k0 k7 k1 + // r4 (--- --- --- ---) k3 k4 k6 k2 + // r5 ( k3 k2 k1 k0) -----> --- --- --- --- + // r6 ( k7 k6 k5 k4) --- --- --- --- +#ifdef STM32F4 // for Cortex-M4 + "ror.w r3,r6, #16 \n\t" // r3( k5 k4 k7 k6) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 k6) + "pkhtb.w r4,r5, r5, asr #16 \n\t" // r4( k3 k2 k3 k2) + "ror.w r5, #8 \n\t" // r5( k0 k3 k2 k1) + "bfi.w r3,r5, #0,#8 \n\t" // r3( k5 k4 k2 k6) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k3 k2) + "ror.w r6,#16 \n\t" // r6( k5 k4 k7 k6) + "bfi.w r4,r6, #8,#8 \n\t" // r4( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r3, r6 \n\t" // r3( k4 k5 k6 k7) + "lsl.w r3, r3, #8 \n\t" // r3( k5 k6 k7 ---) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 
k7 ---) + "lsr.w r5, r5, #8 \n\t" // r5(--- k3 k2 k1) + "bfi.w r3,r5, #0, #8 \n\t" // r3( k5 k0 k7 k1) + "rev16.w r4, r5 \n\t" // r4( k3 --- k1 k2) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k1 k2) + "lsr.w r6, r6, #16 \n\t" // r6(--- --- k7 k6) + "bfi.w r4,r6, #8, #8 \n\t" // r4( k3 k4 k6 k2) +#endif + + // round 13-14 + + // round key store + "str.w r3, [r1,#48] \n\t" + "str.w r4, [r1,#52] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k0 k7 k1) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 \n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(--- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 15-16 + + // round key store + "str.w r5, [r1,#56] \n\t" + "str.w r6, [r1,#60] \n\t" + + // permutation + + // no need to calculate (not used) + +// SC->(AC->ART)->SR->MC + + "add.w r14, r2, #256 \n\t" // point to SBOX ^ c2(0x02) + + "ldm.w r0, {r3-r6} \n\t" // load plaintext + // r0 now free to overwrite + + // round 1 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // 
AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#64] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#68] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 2 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#72] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#76] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 3 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // 
s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#80] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#84] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 4 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#88] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + 
"ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#92] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 5 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#96] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#100] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 6 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + 
// r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#104] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#108] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 7 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" 
+ "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#112] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#116] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 8 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#120] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#124] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + 
// r4 ----------------> r4 + // round 9 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#32] \n\t" // load TK1 + "ldr.w r10, [r1,#36] \n\t" // load TK1 + "ldr.w r11, [r1,#128] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#132] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 10 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#136] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 
\n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#140] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 11 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#40] \n\t" // load TK1 + "ldr.w r10, [r1,#44] \n\t" // load TK1 + "ldr.w r11, [r1,#144] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#148] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 
-----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 12 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#152] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#156] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 13 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" 
+ "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#48] \n\t" // load TK1 + "ldr.w r10, [r1,#52] \n\t" // load TK1 + "ldr.w r11, [r1,#160] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#164] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 14 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#168] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#172] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, 
r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 15 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#56] \n\t" // load TK1 + "ldr.w r10, [r1,#60] \n\t" // load TK1 + "ldr.w r11, [r1,#176] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#180] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 16 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // 
s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#184] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#188] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 17 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 
\n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#192] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#196] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 18 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#200] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#204] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 19 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 
s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#208] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#212] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 20 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#216] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + 
"uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#220] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 21 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#224] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#228] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + 
// round 22 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#232] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#236] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 23 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" 
// s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#240] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#244] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 24 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#248] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#252] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 
--------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 25 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#32] \n\t" // load TK1 + "ldr.w r10, [r1,#36] \n\t" // load TK1 + "ldr.w r11, [r1,#256] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#260] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 26 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#264] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, 
r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#268] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 27 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#40] \n\t" // load TK1 + "ldr.w r10, [r1,#44] \n\t" // load TK1 + "ldr.w r11, [r1,#272] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#276] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ 
AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 28 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#280] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#284] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 29 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] 
\n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#48] \n\t" // load TK1 + "ldr.w r10, [r1,#52] \n\t" // load TK1 + "ldr.w r11, [r1,#288] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#292] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 30 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#296] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#300] \n\t" // 
load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 31 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#56] \n\t" // load TK1 + "ldr.w r10, [r1,#60] \n\t" // load TK1 + "ldr.w r11, [r1,#304] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#308] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 32 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 
\n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#312] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#316] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 33 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, 
lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#320] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#324] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 34 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#328] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#332] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 35 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w 
r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#336] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#340] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 36 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#344] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 
\n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#348] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 37 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#352] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#356] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" 
// r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 38 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#360] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#364] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 39 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, 
r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#368] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#372] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 40 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#376] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#380] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + 
+ // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + + // round 41 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#32] \n\t" // load TK1 + "ldr.w r10, [r1,#36] \n\t" // load TK1 + "ldr.w r11, [r1,#384] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#388] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 42 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#392] \n\t" // load 
TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#396] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 43 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#40] \n\t" // load TK1 + "ldr.w r10, [r1,#44] \n\t" // load TK1 + "ldr.w r11, [r1,#400] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) 
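+                // roundKeys layout, as the offsets in this listing suggest: the first
+                // 64 bytes hold TK1, whose word pairs (#0..#60, wrapping) are mixed in
+                // on alternate rounds, while every round consumes one precomputed
+                // 8-byte TK2 ^ TK3 ^ AC(c0 c1) pair stored consecutively from offset
+                // 64 upward (this round reads #400/#404; each subsequent round
+                // advances by 8 bytes).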
+ "ldr.w r12, [r1,#404] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 44 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#408] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#412] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 45 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w 
r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#48] \n\t" // load TK1 + "ldr.w r10, [r1,#52] \n\t" // load TK1 + "ldr.w r11, [r1,#416] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#420] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 46 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#424] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, 
[r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#428] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 47 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#56] \n\t" // load TK1 + "ldr.w r10, [r1,#60] \n\t" // load TK1 + "ldr.w r11, [r1,#432] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#436] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 48 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w 
r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#440] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#444] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 49 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + 
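+                // SubCells note: each uxtb.w/ror above extracts one state byte so that
+                // ldrb.w can index the 256-byte S-box table at r2 directly; the single
+                // lookup through r14 in each lower half-state uses a second table
+                // pre-XORed with the round-constant bit c2 = 0x02, so AC(c2) costs no
+                // extra instruction.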
"ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#448] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#452] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 50 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#456] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#460] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 51 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 
(s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#464] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#468] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 52 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#472] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 
\n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#476] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 53 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#480] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#484] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 
s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 54 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#488] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#492] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 55 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w 
r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#496] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#500] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 56 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#504] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#508] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 
\n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + + "ldmia.w sp!, {r0} \n\t" // pop store pointer + // r0 reload + + "str.w r3, [r0,#0] \n\t" // store ciphertext + "str.w r4, [r0,#4] \n\t" // store ciphertext + "str.w r5, [r0,#8] \n\t" // store ciphertext + "str.w r6, [r0,#12] \n\t" // store ciphertext + + "ldmia.w sp!, {r4-r12,r14} \n\t" + : + : [block] "r" (block), [roundKeys] "r" (roundKeys), [pSBOX] "" (pSBOX) + : "cc"); +} + diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/api.h b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/api.h new file mode 100644 index 0000000..a4aa567 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/api.h @@ -0,0 +1,5 @@ +#define CRYPTO_KEYBYTES 16 +#define CRYPTO_NSECBYTES 0 +#define CRYPTO_NPUBBYTES 16 +#define CRYPTO_ABYTES 16 +#define CRYPTO_NOOVERLAP 1 diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/crypto_aead.h b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/crypto_aead.h new file mode 100644 index 0000000..cfc09d6 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/crypto_aead.h @@ -0,0 +1,11 @@ +int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen, + const unsigned char *m, unsigned long long mlen, + const unsigned char *ad, unsigned long long adlen, + const unsigned char *nsec, const unsigned char *npub, + const unsigned char *k); + +int crypto_aead_decrypt(unsigned char *m, unsigned long long *outputmlen, + unsigned char *nsec, + const unsigned char *c, unsigned long long clen, + const unsigned char *ad, unsigned long long adlen, + const unsigned char *npub, const unsigned char *k); diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/encrypt.c b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/encrypt.c new file mode 100644 index 0000000..e953677 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/encrypt.c @@ -0,0 +1,540 @@ +/* + * Date: 29 November 2018 + * Contact: Thomas Peyrin - thomas.peyrin@gmail.com + * Mustafa Khairallah - mustafam001@e.ntu.edu.sg + */ + +#include "crypto_aead.h" +#include "api.h" +#include "skinny.h" +#include +#include + +void pad (const unsigned char* m, unsigned char* mp, int l, int len8) { + int i; + + for (i = 0; i < l; i++) { + if (i < len8) { + mp[i] = m[i]; + } + else if (i == l - 1) { + mp[i] = (len8 & 0x0f); + } + else { + mp[i] = 0x00; + } + } + +} + +void g8A (unsigned char* s, unsigned char* c) { + unsigned int tmps[4]; + unsigned int tmpc[4]; + + tmps[0] = *((unsigned int *)&s[0]); + tmps[1] = *((unsigned int *)&s[4]); + tmps[2] = *((unsigned int *)&s[8]); + tmps[3] = *((unsigned int *)&s[12]); + + // c[i] = (s[i] >> 1) ^ (s[i] & 0x80) ^ ((s[i] & 0x01) << 7); + // + // (s[i] >> 1) -> ((s[i]>>1)&0x7f) + // (s[i] & 0x80) -> (s[i])&0x80) not changed + // ((s[i] & 0x01) << 7) -> ((s[i]<<7)&0x80) + + // use word access because of speeding up + tmpc[0] = ((tmps[0]>>1) & 0x7f7f7f7f) ^ (tmps[0] & 0x80808080) ^ ((tmps[0]<<7) & 0x80808080); + tmpc[1] = ((tmps[1]>>1) & 0x7f7f7f7f) ^ (tmps[1] & 0x80808080) ^ ((tmps[1]<<7) & 0x80808080); + tmpc[2] = ((tmps[2]>>1) & 0x7f7f7f7f) ^ (tmps[2] & 0x80808080) ^ ((tmps[2]<<7) & 0x80808080); + tmpc[3] 
= ((tmps[3]>>1) & 0x7f7f7f7f) ^ (tmps[3] & 0x80808080) ^ ((tmps[3]<<7) & 0x80808080); + + *((unsigned int *)&c[0]) = tmpc[0]; + *((unsigned int *)&c[4]) = tmpc[1]; + *((unsigned int *)&c[8]) = tmpc[2]; + *((unsigned int *)&c[12]) = tmpc[3]; +} + +void g8A_for_Tag_Generation (unsigned char* s, unsigned char* c) { + unsigned int tmps[4]; + unsigned int tmpc[4]; + + tmps[0] = *((unsigned int *)&s[0]); + tmps[1] = *((unsigned int *)&s[4]); + tmps[2] = *((unsigned int *)&s[8]); + tmps[3] = *((unsigned int *)&s[12]); + + // c[i] = (s[i] >> 1) ^ (s[i] & 0x80) ^ ((s[i] & 0x01) << 7); + // + // (s[i] >> 1) -> ((s[i]>>1)&0x7f) + // (s[i] & 0x80) -> (s[i])&0x80) not changed + // ((s[i] & 0x01) << 7) -> ((s[i]<<7)&0x80) + + // use word access because of speeding up + tmpc[0] = ((tmps[0]>>1) & 0x7f7f7f7f) ^ (tmps[0] & 0x80808080) ^ ((tmps[0]<<7) & 0x80808080); + tmpc[1] = ((tmps[1]>>1) & 0x7f7f7f7f) ^ (tmps[1] & 0x80808080) ^ ((tmps[1]<<7) & 0x80808080); + tmpc[2] = ((tmps[2]>>1) & 0x7f7f7f7f) ^ (tmps[2] & 0x80808080) ^ ((tmps[2]<<7) & 0x80808080); + tmpc[3] = ((tmps[3]>>1) & 0x7f7f7f7f) ^ (tmps[3] & 0x80808080) ^ ((tmps[3]<<7) & 0x80808080); + + // use byte access because of memory alignment. + // c is not always in word(4 byte) alignment. + c[0] = tmpc[0] &0xFF; + c[1] = (tmpc[0]>>8) &0xFF; + c[2] = (tmpc[0]>>16)&0xFF; + c[3] = (tmpc[0]>>24)&0xFF; + c[4] = tmpc[1] &0xFF; + c[5] = (tmpc[1]>>8) &0xFF; + c[6] = (tmpc[1]>>16)&0xFF; + c[7] = (tmpc[1]>>24)&0xFF; + c[8] = tmpc[2] &0xFF; + c[9] = (tmpc[2]>>8) &0xFF; + c[10] = (tmpc[2]>>16)&0xFF; + c[11] = (tmpc[2]>>24)&0xFF; + c[12] = tmpc[3] &0xFF; + c[13] = (tmpc[3]>>8) &0xFF; + c[14] = (tmpc[3]>>16)&0xFF; + c[15] = (tmpc[3]>>24)&0xFF; +} + +void rho_ad_eqov16 (const unsigned char* m, + unsigned char* s) { + *((unsigned int *)&s[0]) ^= *((unsigned int *)&m[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&m[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&m[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&m[12]); +} + +void rho_ad_ud16 (const unsigned char* m, + unsigned char* s, + int len8) { + unsigned char mp [16]; + + pad(m,mp,16,len8); + *((unsigned int *)&s[0]) ^= *((unsigned int *)&mp[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&mp[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&mp[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&mp[12]); +} + +void rho_eqov16 (const unsigned char* m, + unsigned char* c, + unsigned char* s) { + g8A(s,c); + + *((unsigned int *)&s[0]) ^= *((unsigned int *)&m[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&m[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&m[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&m[12]); + + *((unsigned int *)&c[0]) ^= *((unsigned int *)&m[0]); + *((unsigned int *)&c[4]) ^= *((unsigned int *)&m[4]); + *((unsigned int *)&c[8]) ^= *((unsigned int *)&m[8]); + *((unsigned int *)&c[12]) ^= *((unsigned int *)&m[12]); +} + +void rho_ud16 (const unsigned char* m, + unsigned char* c, + unsigned char* s, + int len8, + int ver) { + int i; + unsigned char mp [16]; + + pad(m,mp,ver,len8); + + g8A(s,c); + *((unsigned int *)&s[0]) ^= *((unsigned int *)&mp[0]); + *((unsigned int *)&s[4]) ^= *((unsigned int *)&mp[4]); + *((unsigned int *)&s[8]) ^= *((unsigned int *)&mp[8]); + *((unsigned int *)&s[12]) ^= *((unsigned int *)&mp[12]); + for (i = 0; i < ver; i++) { + if (i < len8) { + c[i] = c[i] ^ mp[i]; + } + else { + c[i] = 0; + } + } +} + +void irho (unsigned char* m, + const unsigned char* c, + unsigned char* s, + int len8, + int ver) { + int i; + unsigned 
char cp [16]; + + pad(c,cp,ver,len8); + + g8A(s,m); + for (i = 0; i < ver; i++) { + if (i < len8) { + s[i] = s[i] ^ cp[i] ^ m[i]; + } + else { + s[i] = s[i] ^ cp[i]; + } + if (i < len8) { + m[i] = m[i] ^ cp[i]; + } + else { + m[i] = 0; + } + } +} + +void reset_lfsr_gf56 (unsigned char* CNT) { + *((unsigned int *)&CNT[0]) = 0x00000001; + *((unsigned int *)&CNT[4]) = 0x00000000; +} + +void lfsr_gf56 (unsigned char* CNT) { + unsigned int tmpCNT[2]; + unsigned int fb0; + + tmpCNT[0] = *((unsigned int *)&CNT[0]); // CNT3 CNT2 CNT1 CNT0 + tmpCNT[1] = *((unsigned int *)&CNT[4]); // CNT7 CNT6 CNT5 CNT4 + + fb0 = 0; + if ((tmpCNT[1] >> 23)&0x01) { + fb0 = 0x95; + } + + tmpCNT[1] = tmpCNT[1] << 1 | tmpCNT[0] >> 31; + tmpCNT[0] = tmpCNT[0] << 1 ^ fb0; + + *((unsigned int *)&CNT[0]) = tmpCNT[0]; + *((unsigned int *)&CNT[4]) = tmpCNT[1]; +} + +void block_cipher(unsigned char* s, + const unsigned char* k, unsigned char* T, + unsigned char* CNT, + skinny_ctrl* p_skinny_ctrl) { + p_skinny_ctrl->func_skinny_128_384_enc (s,p_skinny_ctrl,CNT,T,k); +} + +void nonce_encryption (const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + skinny_ctrl* p_skinny_ctrl) { + unsigned char T [16]; + + *((unsigned int *)&T[0]) = *((unsigned int *)&N[0]); + *((unsigned int *)&T[4]) = *((unsigned int *)&N[4]); + *((unsigned int *)&T[8]) = *((unsigned int *)&N[8]); + *((unsigned int *)&T[12]) = *((unsigned int *)&N[12]); + CNT[7] = D; + block_cipher(s,k,T,CNT,p_skinny_ctrl); + +} + +void generate_tag (unsigned char** c, unsigned char* s, + int n, unsigned long long* clen) { + g8A_for_Tag_Generation(s, *c); + *c = *c + n; + *c = *c - *clen; +} + +unsigned long long msg_encryption_eqov16 (const unsigned char** M, unsigned char** c, + const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + unsigned long long mlen, + skinny_ctrl* p_skinny_ctrl) { + rho_eqov16(*M, *c, s); + *c = *c + 16; + *M = *M + 16; + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl); + return mlen - 16; +} + +unsigned long long msg_encryption_ud16 (const unsigned char** M, unsigned char** c, + const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + unsigned long long mlen, + skinny_ctrl* p_skinny_ctrl) { + rho_ud16(*M, *c, s, mlen, 16); + *c = *c + mlen; + *M = *M + mlen; + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl); + return 0; +} + +unsigned long long msg_decryption (unsigned char** M, const unsigned char** c, + const unsigned char* N, + unsigned char* CNT, + unsigned char*s, const unsigned char* k, + unsigned char D, + unsigned long long clen, + skinny_ctrl* p_skinny_ctrl) { + int len8; + + if (clen >= 16) { + len8 = 16; + clen = clen - 16; + } + else { + len8 = clen; + clen = 0; + } + irho(*M, *c, s, len8, 16); + *c = *c + len8; + *M = *M + len8; + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,D,p_skinny_ctrl); + return clen; +} + +unsigned long long ad_encryption_eqov32 (const unsigned char** A, unsigned char* s, + const unsigned char* k, unsigned long long adlen, + unsigned char* CNT, + unsigned char D, + skinny_ctrl* p_skinny_ctrl) { + + unsigned char T [16]; + + rho_ad_eqov16(*A, s); + *A = *A + 16; + lfsr_gf56(CNT); + + //pad(*A, T, 16, 16); + *((unsigned int *)&T[0]) = *((unsigned int *)&(*A)[0]); + *((unsigned int *)&T[4]) = *((unsigned int *)&(*A)[4]); + *((unsigned int *)&T[8]) = *((unsigned int *)&(*A)[8]); + *((unsigned int *)&T[12]) = *((unsigned int 
*)&(*A)[12]); + *A = *A + 16; + CNT[7] = D; + block_cipher(s,k,T,CNT,p_skinny_ctrl); + lfsr_gf56(CNT); + + return adlen - 32; +} + +unsigned long long ad_encryption_ov16 (const unsigned char** A, unsigned char* s, + const unsigned char* k, unsigned long long adlen, + unsigned char* CNT, + unsigned char D, + skinny_ctrl* p_skinny_ctrl) { + + unsigned char T [16]; + + adlen = adlen - 16; + rho_ad_eqov16(*A, s); + *A = *A + 16; + lfsr_gf56(CNT); + + pad(*A, T, 16, adlen); + *A = *A + adlen; + CNT[7] = D; + block_cipher(s,k,T,CNT,p_skinny_ctrl); + lfsr_gf56(CNT); + + return 0; +} + +unsigned long long ad_encryption_eq16 (const unsigned char** A, unsigned char* s, + unsigned char* CNT) { + + rho_ad_eqov16(*A, s); + *A = *A + 16; + lfsr_gf56(CNT); + + return 0; +} + +unsigned long long ad_encryption_ud16 (const unsigned char** A, unsigned char* s, + unsigned long long adlen, + unsigned char* CNT) { + + rho_ad_ud16(*A, s, adlen); + *A = *A + adlen; + lfsr_gf56(CNT); + + return 0; +} + +int crypto_aead_encrypt ( + unsigned char* c, unsigned long long* clen, + const unsigned char* m, unsigned long long mlen, + const unsigned char* ad, unsigned long long adlen, + const unsigned char* nsec, + const unsigned char* npub, + const unsigned char* k + ) +{ + unsigned char s[16]; + // size 7 -> 8 for word access + unsigned char CNT[8]; + const unsigned char* A; + const unsigned char* M; + const unsigned char* N; + + skinny_ctrl l_skinny_ctrl; + + (void) nsec; + A = ad; + M = m; + N = npub; + + l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12; + + *((unsigned int *)&s[0]) = 0x00000000; + *((unsigned int *)&s[4]) = 0x00000000; + *((unsigned int *)&s[8]) = 0x00000000; + *((unsigned int *)&s[12]) = 0x00000000; + reset_lfsr_gf56(CNT); + + if (adlen == 0) { // AD is an empty string + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl); + } + else while (adlen > 0) { + if (adlen < 16) { // The last block of AD is odd and incomplete + adlen = ad_encryption_ud16(&A,s,adlen,CNT); + nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl); + } + else if (adlen == 16) { // The last block of AD is odd and complete + adlen = ad_encryption_eq16(&A,s,CNT); + nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl); + } + else if (adlen < (32)) { // The last block of AD is even and incomplete + adlen = ad_encryption_ov16(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl); + nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl); + } + else if (adlen == (32)) { // The last block of AD is even and complete + adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl); + nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl); + } + else { // A normal full pair of blocks of AD + adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl); + } + } + + // because, nonce_encryption is called at the last block of AD encryption + l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc1_1; + + reset_lfsr_gf56(CNT); + + *clen = mlen + 16; + + if (mlen == 0) { // M is an empty string + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,0x15,&l_skinny_ctrl); + } + else while (mlen > 0) { + if (mlen < 16) { // The last block of M is incomplete + mlen = msg_encryption_ud16(&M,&c,N,CNT,s,k,0x15,mlen,&l_skinny_ctrl); + } + else if (mlen == 16) { // The last block of M is complete + mlen = msg_encryption_eqov16(&M,&c,N,CNT,s,k,0x14,mlen,&l_skinny_ctrl); + } + else { // A normal full message block + mlen = msg_encryption_eqov16(&M,&c,N,CNT,s,k,0x04,mlen,&l_skinny_ctrl); + } + } + + // Tag generation + generate_tag(&c,s,16,clen); + + return 0; +} 
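+
+/*
+ * Minimal round-trip sketch of the two AEAD entry points in this file; the
+ * guard macro ROMULUS_EXAMPLE_SELFTEST is hypothetical and keeps the sketch
+ * out of normal builds. Buffer sizes follow api.h (16-byte key, nonce, tag).
+ */
+#ifdef ROMULUS_EXAMPLE_SELFTEST
+#include <string.h>
+static int romulus_roundtrip_example(void)
+{
+    unsigned char k[CRYPTO_KEYBYTES] = {0};
+    unsigned char npub[CRYPTO_NPUBBYTES] = {0};
+    unsigned char ad[3] = {1, 2, 3};
+    unsigned char m[4] = {'t', 'e', 's', 't'}, m2[4];
+    unsigned char ct[4 + CRYPTO_ABYTES];
+    unsigned long long clen, mlen2;
+
+    crypto_aead_encrypt(ct, &clen, m, 4, ad, 3, NULL, npub, k);  /* clen = 4 + 16 */
+    if (crypto_aead_decrypt(m2, &mlen2, NULL, ct, clen, ad, 3, npub, k) != 0)
+        return -1;                       /* tag mismatch */
+    return memcmp(m, m2, 4);             /* 0 on a successful round trip */
+}
+#endif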
+ +int crypto_aead_decrypt( +unsigned char *m,unsigned long long *mlen, +unsigned char *nsec, +const unsigned char *c,unsigned long long clen, +const unsigned char *ad,unsigned long long adlen, +const unsigned char *npub, +const unsigned char *k +) +{ + + unsigned char s[16]; + unsigned char T[16]; + // size 7 -> 8 for word access + unsigned char CNT[8]; + const unsigned char* A; + unsigned char* M; + const unsigned char* N; + unsigned int i; + + skinny_ctrl l_skinny_ctrl; + + (void) nsec; + A = ad; + M = m; + N = npub; + + l_skinny_ctrl.func_skinny_128_384_enc = skinny_128_384_enc123_12; + + for (i = 0; i < 16; i++) { + s[i] = 0; + } + reset_lfsr_gf56(CNT); + + if (adlen == 0) { // AD is an empty string + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl); + } + else while (adlen > 0) { + if (adlen < 16) { // The last block of AD is odd and incomplete + adlen = ad_encryption_ud16(&A,s,adlen,CNT); + nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl); + } + else if (adlen == 16) { // The last block of AD is odd and complete + adlen = ad_encryption_eq16(&A,s,CNT); + nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl); + } + else if (adlen < (32)) { // The last block of AD is even and incomplete + adlen = ad_encryption_ov16(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl); + nonce_encryption(N,CNT,s,k,0x1a,&l_skinny_ctrl); + } + else if (adlen == (32)) { // The last block of AD is even and complete + adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl); + nonce_encryption(N,CNT,s,k,0x18,&l_skinny_ctrl); + } + else { // A normal full pair of blocks of AD + adlen = ad_encryption_eqov32(&A,s,k,adlen,CNT,0x08,&l_skinny_ctrl); + } + } + + reset_lfsr_gf56(CNT); + + clen = clen - 16; + *mlen = clen; + + if (clen == 0) { // C is an empty string + lfsr_gf56(CNT); + nonce_encryption(N,CNT,s,k,0x15,&l_skinny_ctrl); + } + else while (clen > 0) { + if (clen < 16) { // The last block of C is incomplete + clen = msg_decryption(&M,&c,N,CNT,s,k,0x15,clen,&l_skinny_ctrl); + } + else if (clen == 16) { // The last block of C is complete + clen = msg_decryption(&M,&c,N,CNT,s,k,0x14,clen,&l_skinny_ctrl); + } + else { // A normal full message block + clen = msg_decryption(&M,&c,N,CNT,s,k,0x04,clen,&l_skinny_ctrl); + } + } + + // Tag generation + g8A_for_Tag_Generation(s, T); + for (i = 0; i < 16; i++) { + if (T[i] != (*(c+i))) { + return -1; + } + } + + return 0; +} + + diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/genkat_aead.c b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/genkat_aead.c new file mode 100644 index 0000000..21f840f --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/genkat_aead.c @@ -0,0 +1,161 @@ +// +// NIST-developed software is provided by NIST as a public service. +// You may use, copy and distribute copies of the software in any medium, +// provided that you keep intact this entire notice. You may improve, +// modify and create derivative works of the software or any portion of +// the software, and you may copy and distribute such modifications or +// works. Modified works should carry a notice stating that you changed +// the software and should note the date and nature of any such change. +// Please explicitly acknowledge the National Institute of Standards and +// Technology as the source of the software. +// +// NIST-developed software is expressly provided "AS IS." 
NIST MAKES NO
+// WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT OR ARISING BY OPERATION
+// OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT AND DATA ACCURACY. NIST
+// NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE
+// UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST
+// DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE
+// OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY,
+// RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+//
+// You are solely responsible for determining the appropriateness of using and
+// distributing the software and you assume all risks associated with its use,
+// including but not limited to the risks and costs of program errors, compliance
+// with applicable laws, damage to or loss of data, programs or equipment, and
+// the unavailability or interruption of operation. This software is not intended
+// to be used in any situation where a failure could cause risk of injury or
+// damage to property. The software developed by NIST employees is not subject to
+// copyright protection within the United States.
+//
+
+// disable deprecation for sprintf and fopen
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "crypto_aead.h"
+#include "api.h"
+
+#define KAT_SUCCESS 0
+#define KAT_FILE_OPEN_ERROR -1
+#define KAT_DATA_ERROR -3
+#define KAT_CRYPTO_FAILURE -4
+
+#define MAX_FILE_NAME 256
+#define MAX_MESSAGE_LENGTH 32
+#define MAX_ASSOCIATED_DATA_LENGTH 32
+
+void init_buffer(unsigned char *buffer, unsigned long long numbytes);
+
+void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length);
+
+int generate_test_vectors();
+
+int main()
+{
+    int ret = generate_test_vectors();
+
+    if (ret != KAT_SUCCESS) {
+        fprintf(stderr, "test vector generation failed with code %d\n", ret);
+    }
+
+    return ret;
+}
+
+int generate_test_vectors()
+{
+    FILE *fp;
+    char fileName[MAX_FILE_NAME];
+    unsigned char key[CRYPTO_KEYBYTES];
+    unsigned char nonce[CRYPTO_NPUBBYTES];
+    unsigned char msg[MAX_MESSAGE_LENGTH];
+    unsigned char msg2[MAX_MESSAGE_LENGTH];
+    unsigned char ad[MAX_ASSOCIATED_DATA_LENGTH];
+    unsigned char ct[MAX_MESSAGE_LENGTH + CRYPTO_ABYTES];
+    unsigned long long clen, mlen2;
+    int count = 1;
+    int func_ret, ret_val = KAT_SUCCESS;
+
+    init_buffer(key, sizeof(key));
+    init_buffer(nonce, sizeof(nonce));
+    init_buffer(msg, sizeof(msg));
+    init_buffer(ad, sizeof(ad));
+
+    sprintf(fileName, "LWC_AEAD_KAT_%d_%d.txt", (CRYPTO_KEYBYTES * 8), (CRYPTO_NPUBBYTES * 8));
+
+    if ((fp = fopen(fileName, "w")) == NULL) {
+        fprintf(stderr, "Couldn't open <%s> for write\n", fileName);
+        return KAT_FILE_OPEN_ERROR;
+    }
+
+    for (unsigned long long mlen = 0; (mlen <= MAX_MESSAGE_LENGTH) && (ret_val == KAT_SUCCESS); mlen++) {
+        for (unsigned long long adlen = 0; adlen <= MAX_ASSOCIATED_DATA_LENGTH; adlen++) {
+
+            fprintf(fp, "Count = %d\n", count++);
+            printf("Count = %d\n", count - 1);
+
+            fprint_bstr(fp, "Key = ", key, CRYPTO_KEYBYTES);
+
+            fprint_bstr(fp, "Nonce = ", nonce, CRYPTO_NPUBBYTES);
+
+            fprint_bstr(fp, "PT = ", msg, mlen);
+
+            fprint_bstr(fp, "AD = ", ad, adlen);
+
+            if ((func_ret = crypto_aead_encrypt(ct, &clen, msg, mlen, ad, adlen, NULL, nonce, key)) != 0) {
+                fprintf(fp, "crypto_aead_encrypt returned <%d>\n", func_ret);
+                ret_val = KAT_CRYPTO_FAILURE;
+                break;
+            }
+
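+            // CT below is ciphertext || tag, so clen == mlen + CRYPTO_ABYTES;
+            // together the Count/Key/Nonce/PT/AD/CT lines form one record of
+            // the NIST LWC known-answer-test file.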
fprint_bstr(fp, "CT = ", ct, clen); + + fprintf(fp, "\n"); + + if ((func_ret = crypto_aead_decrypt(msg2, &mlen2, NULL, ct, clen, ad, adlen, nonce, key)) != 0) { + fprintf(fp, "crypto_aead_decrypt returned <%d>\n", func_ret); + ret_val = KAT_CRYPTO_FAILURE; + break; + } + + if (mlen != mlen2) { + fprintf(fp, "crypto_aead_decrypt returned bad 'mlen': Got <%llu>, expected <%llu>\n", mlen2, mlen); + ret_val = KAT_CRYPTO_FAILURE; + break; + } + + if (memcmp(msg, msg2, mlen)) { + fprintf(fp, "crypto_aead_decrypt did not recover the plaintext\n"); + ret_val = KAT_CRYPTO_FAILURE; + break; + } + } + } + + fclose(fp); + + return ret_val; +} + + +void fprint_bstr(FILE *fp, const char *label, const unsigned char *data, unsigned long long length) +{ + fprintf(fp, "%s", label); + + for (unsigned long long i = 0; i < length; i++) + fprintf(fp, "%02X", data[i]); + + fprintf(fp, "\n"); +} + +void init_buffer(unsigned char *buffer, unsigned long long numbytes) +{ + for (unsigned long long i = 0; i < numbytes; i++) + buffer[i] = (unsigned char)i; +} diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny.h b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny.h new file mode 100644 index 0000000..6392b0f --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny.h @@ -0,0 +1,8 @@ +typedef struct ___skinny_ctrl { + unsigned char roundKeys[960]; // number of round : 56 + void (*func_skinny_128_384_enc)(unsigned char*, struct ___skinny_ctrl*, unsigned char* CNT, unsigned char* T, const unsigned char* K); +} skinny_ctrl; + +extern void skinny_128_384_enc123_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K); +extern void skinny_128_384_enc12_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K); +extern void skinny_128_384_enc1_1 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K); diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_key_schedule2.c b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_key_schedule2.c new file mode 100644 index 0000000..58006f2 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_key_schedule2.c @@ -0,0 +1,3027 @@ +/****************************************************************************** + * Copyright (c) 2020, NEC Corporation. + * + * THIS CODE IS FURNISHED TO YOU "AS IS" WITHOUT WARRANTY OF ANY KIND. 
+ * + *****************************************************************************/ + +/* + * SKINNY-128-384 + * + * load * AC(c0 c1) ^ TK3 + * calc AC(c0 c1) ^ TK2 -> store + * ART(TK2) + * + * number of rounds : 56 + */ + +__attribute__((aligned(4))) +void RunEncryptionKeyScheduleTK2(unsigned char *roundKeys) +{ + // r0 : points to roundKeys(& masterKey) + // r1-r4 : key state + // r5-r6 : temp use + // r7 : constant(0xfefefefe) + // r8 : constant(0x01010101) + // r9 : temp use + // r10 : temp use + asm volatile( + "stmdb sp!, {r4-r10} \n\t" + "ldr.w r1, [r0,#16] \n\t" // load master key + "ldr.w r2, [r0,#20] \n\t" // load master key + "ldr.w r3, [r0,#24] \n\t" // load master key + "ldr.w r4, [r0,#28] \n\t" // load master key + "mov.w r7, #0xfefefefe \n\t" + "mov.w r8, #0x01010101 \n\t" + + // round 1 + + "ldr.w r9, [r0,#512] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#516] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#64] \n\t" + "str.w r10, [r0,#68] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 2 + + "ldr.w r9, [r0,#520] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#524] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#72] \n\t" + "str.w r10, [r0,#76] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, 
asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 3 + + "ldr.w r9, [r0,#528] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#532] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#80] \n\t" + "str.w r10, [r0,#84] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 4 + + "ldr.w r9, [r0,#536] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#540] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#88] \n\t" + "str.w r10, [r0,#92] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 
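+    // The mov/ror/bfi block below implements the SKINNY tweakey permutation
+    // P_T: the old bottom half k8..k15 (parked in r5/r6) is scattered into the
+    // new top half as r1 = (k13 k8 k15 k9) and r2 = (k11 k12 k14 k10), while
+    // the old top half k0..k7 drops unchanged into r3/r4 for the next round.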
+ "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 5 + + "ldr.w r9, [r0,#544] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#548] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#96] \n\t" + "str.w r10, [r0,#100] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 6 + + "ldr.w r9, [r0,#552] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#556] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w 
r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#104] \n\t" + "str.w r10, [r0,#108] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 7 + + "ldr.w r9, [r0,#560] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#564] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#112] \n\t" + "str.w r10, [r0,#116] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + 
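+    // Layout implied by the fixed offsets in this file: round r (1..56) reads
+    // its 8-byte TK3 ^ AC entry at roundKeys[512 + 8*(r-1)] and stores the
+    // combined TK2 ^ TK3 ^ AC entry 448 bytes lower, at roundKeys[64 + 8*(r-1)],
+    // so the 56 rounds fill bytes 64..511 of the 960-byte roundKeys array.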
"and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 8 + + "ldr.w r9, [r0,#568] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#572] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#120] \n\t" + "str.w r10, [r0,#124] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 9 + + "ldr.w r9, [r0,#576] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#580] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#128] \n\t" + "str.w r10, [r0,#132] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, 
r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 10 + + "ldr.w r9, [r0,#584] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#588] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#136] \n\t" + "str.w r10, [r0,#140] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 11 + + "ldr.w r9, [r0,#592] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#596] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#144] \n\t" + "str.w r10, [r0,#148] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // 
r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 12 + + "ldr.w r9, [r0,#600] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#604] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#152] \n\t" + "str.w r10, [r0,#156] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 13 + + "ldr.w r9, [r0,#608] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#612] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#160] \n\t" + "str.w r10, [r0,#164] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr 
#16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 14 + + "ldr.w r9, [r0,#616] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#620] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#168] \n\t" + "str.w r10, [r0,#172] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 15 + + "ldr.w r9, [r0,#624] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#628] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#176] \n\t" + "str.w r10, [r0,#180] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 
k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 16 + + "ldr.w r9, [r0,#632] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#636] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#184] \n\t" + "str.w r10, [r0,#188] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 17 + + "ldr.w r9, [r0,#640] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#644] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + 
"eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#192] \n\t" + "str.w r10, [r0,#196] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 18 + + "ldr.w r9, [r0,#648] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#652] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#200] \n\t" + "str.w r10, [r0,#204] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" 
+ + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 19 + + "ldr.w r9, [r0,#656] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#660] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#208] \n\t" + "str.w r10, [r0,#212] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 20 + + "ldr.w r9, [r0,#664] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#668] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#216] \n\t" + "str.w r10, [r0,#220] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + 
"lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 21 + + "ldr.w r9, [r0,#672] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#676] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#224] \n\t" + "str.w r10, [r0,#228] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 22 + + "ldr.w r9, [r0,#680] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#684] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#232] \n\t" + "str.w r10, [r0,#236] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 
\n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 23 + + "ldr.w r9, [r0,#688] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#692] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#240] \n\t" + "str.w r10, [r0,#244] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 24 + + "ldr.w r9, [r0,#696] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#700] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#248] \n\t" + "str.w r10, [r0,#252] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w 
r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 25 + + "ldr.w r9, [r0,#704] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#708] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#256] \n\t" + "str.w r10, [r0,#260] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 26 + + "ldr.w r9, [r0,#712] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#716] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#264] \n\t" + "str.w r10, [r0,#268] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 
k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 27 + + "ldr.w r9, [r0,#720] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#724] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#272] \n\t" + "str.w r10, [r0,#276] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 28 + + "ldr.w r9, [r0,#728] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#732] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 
^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#280] \n\t" + "str.w r10, [r0,#284] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 29 + + "ldr.w r9, [r0,#736] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#740] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#288] \n\t" + "str.w r10, [r0,#292] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w 
r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 30 + + "ldr.w r9, [r0,#744] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#748] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#296] \n\t" + "str.w r10, [r0,#300] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 31 + + "ldr.w r9, [r0,#752] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#756] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#304] \n\t" + "str.w r10, [r0,#308] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 
k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 32 + + "ldr.w r9, [r0,#760] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#764] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#312] \n\t" + "str.w r10, [r0,#316] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 33 + + "ldr.w r9, [r0,#768] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#772] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#320] \n\t" + "str.w r10, [r0,#324] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w 
r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 34 + + "ldr.w r9, [r0,#776] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#780] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#328] \n\t" + "str.w r10, [r0,#332] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 35 + + "ldr.w r9, [r0,#784] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#788] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#336] \n\t" + "str.w r10, [r0,#340] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + 
"pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 36 + + "ldr.w r9, [r0,#792] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#796] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#344] \n\t" + "str.w r10, [r0,#348] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 37 + + "ldr.w r9, [r0,#800] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#804] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#352] \n\t" + "str.w r10, [r0,#356] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 
k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 38 + + "ldr.w r9, [r0,#808] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#812] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#360] \n\t" + "str.w r10, [r0,#364] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 39 + + "ldr.w r9, [r0,#816] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#820] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // 
TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#368] \n\t" + "str.w r10, [r0,#372] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 40 + + "ldr.w r9, [r0,#824] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#828] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#376] \n\t" + "str.w r10, [r0,#380] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 
\n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 41 + + "ldr.w r9, [r0,#832] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#836] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#384] \n\t" + "str.w r10, [r0,#388] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 42 + + "ldr.w r9, [r0,#840] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#844] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#392] \n\t" + "str.w r10, [r0,#396] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" 
// r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 43 + + "ldr.w r9, [r0,#848] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#852] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#400] \n\t" + "str.w r10, [r0,#404] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 44 + + "ldr.w r9, [r0,#856] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#860] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#408] \n\t" + "str.w r10, [r0,#412] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for 
Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 45 + + "ldr.w r9, [r0,#864] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#868] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#416] \n\t" + "str.w r10, [r0,#420] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 46 + + "ldr.w r9, [r0,#872] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#876] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#424] \n\t" + "str.w r10, [r0,#428] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 
k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 47 + + "ldr.w r9, [r0,#880] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#884] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#432] \n\t" + "str.w r10, [r0,#436] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 48 + + "ldr.w r9, [r0,#888] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#892] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#440] \n\t" + "str.w r10, [r0,#444] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + 
// r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 49 + + "ldr.w r9, [r0,#896] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#900] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#448] \n\t" + "str.w r10, [r0,#452] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 50 + + "ldr.w r9, [r0,#904] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#908] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 
\n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#456] \n\t" + "str.w r10, [r0,#460] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 51 + + "ldr.w r9, [r0,#912] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#916] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#464] \n\t" + "str.w r10, [r0,#468] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w 
r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 52 + + "ldr.w r9, [r0,#920] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#924] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#472] \n\t" + "str.w r10, [r0,#476] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 53 + + "ldr.w r9, [r0,#928] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#932] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#480] \n\t" + "str.w r10, [r0,#484] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 
\n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 54 + + "ldr.w r9, [r0,#936] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#940] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#488] \n\t" + "str.w r10, [r0,#492] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x6) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 55 + + "ldr.w r9, [r0,#944] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#948] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store((TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#496] \n\t" + "str.w r10, [r0,#500] \n\t" + + // permutation + // r1 (k3 k2 k1 k0) k13 k8 k15 k9 + // r2 (k7 k6 k5 k4) k11 k12 k14 k10 + // r3 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r4 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r5, r3 \n\t" // r5(k11 k10 k9 k8 ) + "mov r6, r4 \n\t" // r6(k15 k14 k13 k12) + "mov r3, r1 \n\t" // r3(k3 k2 k1 k0) + "mov r4, r2 \n\t" // r4(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r1,r6, #16 \n\t" // r1(k13 k12 k15 k14) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 k14) + "pkhtb.w r2,r5, r5, asr #16 \n\t" // r2(k11 k10 k11 k10) + "ror.w r5, #8 \n\t" // r5( k8 k11 k10 k8) + "bfi.w r1,r5, #0,#8 \n\t" // r1(k13 k8 k15 k9) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k11 k10) + "ror.w r6,#16 \n\t" // r6(k13 k12 k15 k14) + "bfi.w r2,r6, #8,#8 \n\t" // r2(k11 k12 k14 k10) +#else // 
for Cortex-M3 + "rev.w r1, r6 \n\t" // r1(k12 k13 k14 k15) + "lsl.w r1, r1, #8 \n\t" // r1(k13 k14 k15 --) + "bfi.w r1,r5, #16,#8 \n\t" // r1(k13 k8 k15 --) + "lsr.w r5, r5, #8 \n\t" // r5( -- k11 k10 k9) + "bfi.w r1,r5, #0, #8 \n\t" // r1(k13 k8 k15 k9) + "rev16.w r2, r5 \n\t" // r2(k11 -- k9 k10) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k11 k12 k9 k10) + "lsr.w r6, r6, #16 \n\t" // r6(-- -- k15 k14) + "bfi.w r2,r6, #8, #8 \n\t" // r2(k11 k12 k14 k10) +#endif + // LFSR(for TK2) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x6 x5 x4 x3 x2 x1 x0 x7^x5) + "and.w r5, r7, r1, lsl #1 \n\t" + "and.w r6, r8, r1, lsr #7 \n\t" + "and.w r1, r8, r1, lsr #5 \n\t" + "eor.w r1, r6 \n\t" + "eor.w r1, r5 \n\t" + + "and.w r5, r7, r2, lsl #1 \n\t" + "and.w r6, r8, r2, lsr #7 \n\t" + "and.w r2, r8, r2, lsr #5 \n\t" + "eor.w r2, r6 \n\t" + "eor.w r2, r5 \n\t" + + // round 56 + + "ldr.w r9, [r0,#952] \n\t" // load TK3 ^ AC(c0 c1) + "ldr.w r10, [r0,#956] \n\t" // load TK3 ^ AC(c0 c1) + + "eor.w r9, r1 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r10, r2 \n\t" // TK2 ^ TK3 ^ AC(c0 c1) + + // round key store (TK2 ^ TK3 ^ AC(c0 c1)) + "str.w r9, [r0,#504] \n\t" + "str.w r10, [r0,#508] \n\t" + + // permutation + + // no need to compute the final permutation (its result is never used) + + "ldmia.w sp!, {r4-r10} \n\t" + : + : [roundKeys] "r" (roundKeys) + : "cc", "memory", "r1", "r2", "r3"); +} diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_key_schedule3.c b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_key_schedule3.c new file mode 100644 index 0000000..81c4406 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_key_schedule3.c @@ -0,0 +1,2898 @@ +/****************************************************************************** + * Copyright (c) 2020, NEC Corporation. + * + * THIS CODE IS FURNISHED TO YOU "AS IS" WITHOUT WARRANTY OF ANY KIND.
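 *
 *****************************************************************************/

/*
 * For reference, the TK3 byte LFSR applied throughout this file, written as
 * a word-level C model that steps four tweakey cells at once. The masks
 * match the r8 = 0x7f7f7f7f and r9 = 0x80808080 constants loaded below; the
 * name skinny128_lfsr3_word is illustrative only, not part of this file:
 *
 *   static inline unsigned int skinny128_lfsr3_word(unsigned int x)
 *   {
 *       return ((x >> 1) & 0x7f7f7f7fu)    // x7..x1 move down one position
 *            ^ ((x << 7) & 0x80808080u)    // x0 into bit 7
 *            ^ ((x << 1) & 0x80808080u);   // x6 into bit 7: feedback x0^x6
 *   }
 *
 * Per byte this is (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1).
 */

/*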
+ * + *****************************************************************************/ + +/* + * SKINNY-128-384 + * + * AC(c0 c1) ^ TK3 -> store + * ART(TK3) + * + * number of rounds : 56 + */ + +__attribute__((aligned(4))) +void RunEncryptionKeyScheduleTK3(unsigned char *roundKeys, unsigned char *pRC) +{ + // r0 : points to roundKeys(& masterKey) + // r1 : points to RC + // r2-r5 : key state + // r6-r7 : temp use + // r8 : constant(0x7f7f7f7f) + // r9 : constant(0x80808080) + asm volatile( + "stmdb sp!, {r4-r9} \n\t" + "ldr.w r2, [r0,#32] \n\t" // load master key + "ldr.w r3, [r0,#36] \n\t" // load master key + "ldr.w r4, [r0,#40] \n\t" // load master key + "ldr.w r5, [r0,#44] \n\t" // load master key + "mov.w r8, #0x7f7f7f7f \n\t" + "mov.w r9, #0x80808080 \n\t" + + // round 1 + + // AC(c0 c1) + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#512] \n\t" + "str.w r3, [r0,#516] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x3 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 2 + + // AC(c0 c1) + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#520] \n\t" + "str.w r3, [r0,#524] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" //
r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 3 + + // AC(c0 c1) + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#528] \n\t" + "str.w r3, [r0,#532] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 4 + + // AC(c0 c1) + "eor.w r6, r2, #0xf \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#536] \n\t" + "str.w r3, [r0,#540] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w 
r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 5 + + // AC(c0 c1) + "eor.w r6, r2, #0xf \n\t" // k0^rc + "eor.w r7, r3, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#544] \n\t" + "str.w r7, [r0,#548] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 6 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#552] \n\t" + "str.w r7, [r0,#556] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, 
r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 7 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xd \n\t" // k0^rc + + + // round key store + "str.w r6, [r0,#560] \n\t" + "str.w r7, [r0,#564] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 8 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#568] \n\t" + "str.w r7, [r0,#572] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, 
r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 9 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#576] \n\t" + "str.w r7, [r0,#580] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 10 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xf \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#584] \n\t" + "str.w r7, [r0,#588] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 
k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 11 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#592] \n\t" + "str.w r7, [r0,#596] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 12 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#600] \n\t" + "str.w r7, [r0,#604] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 
k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 13 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#608] \n\t" + "str.w r7, [r0,#612] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 14 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#616] \n\t" + "str.w r7, [r0,#620] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 
k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 15 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#624] \n\t" + "str.w r7, [r0,#628] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 16 + + // AC(c0 c1) + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#632] \n\t" + "str.w r3, [r0,#636] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 
k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 17 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xd \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#640] \n\t" + "str.w r7, [r0,#644] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 18 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0xa \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#648] \n\t" + "str.w r7, [r0,#652] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 
k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 19 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x5 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#656] \n\t" + "str.w r7, [r0,#660] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 20 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#664] \n\t" + "str.w r7, [r0,#668] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 
k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 21 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#672] \n\t" + "str.w r7, [r0,#676] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 22 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#680] \n\t" + "str.w r7, [r0,#684] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 
k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 23 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x8 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#688] \n\t" + "str.w r7, [r0,#692] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 24 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x0 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#696] \n\t" + "str.w r7, [r0,#700] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + 
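//
// For reference, the round-constant nibbles XORed into cells k0 and k4 in
// this file (0x1, 0x3, 0x7, ... in round order) come from the 6-bit LFSR of
// the SKINNY specification; a minimal C sketch, with the name skinny_rc
// illustrative only, not part of this file:
//
//   static void skinny_rc(unsigned char c0[56], unsigned char c1[56])
//   {
//       unsigned char rc = 0;
//       for (int r = 0; r < 56; r++) {
//           rc = (unsigned char)(((rc << 1) & 0x3f)
//                              | (1 ^ ((rc >> 5) & 1) ^ ((rc >> 4) & 1)));
//           c0[r] = rc & 0xf;   // XORed into k0 (low byte of r2)
//           c1[r] = rc >> 4;    // XORed into k4 (low byte of r3)
//       }
//   }
//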
"pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 25 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#704] \n\t" + "str.w r7, [r0,#708] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 26 + + // AC(c0 c1) + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#712] \n\t" + "str.w r3, [r0,#716] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 
k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 27 + + // AC(c0 c1) + "eor.w r6, r2, #0x5 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#720] \n\t" + "str.w r3, [r0,#724] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 28 + + // AC(c0 c1) + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#728] \n\t" + "str.w r3, [r0,#732] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, 
r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 29 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x7 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#736] \n\t" + "str.w r7, [r0,#740] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 30 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xe \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#744] \n\t" + "str.w r7, [r0,#748] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 
\n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 31 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#752] \n\t" + "str.w r7, [r0,#756] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 32 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x8 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#760] \n\t" + "str.w r7, [r0,#764] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" 
// r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 33 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#768] \n\t" + "str.w r7, [r0,#772] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 34 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#776] \n\t" + "str.w r7, [r0,#780] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef 
STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 35 + + // AC(c0 c1) + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#784] \n\t" + "str.w r3, [r0,#788] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 36 + + // AC(c0 c1) + "eor.w r6, r2, #0xd \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#792] \n\t" + "str.w r3, [r0,#796] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + 
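+    /*
+     * Annotation: both branches of this #ifdef compute the same byte
+     * shuffle. pkhtb belongs to the ARMv7E-M DSP extension (Cortex-M4/M7)
+     * and is unavailable on Cortex-M3, hence the rev/rev16 fallback.
+     */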
"ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 37 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xb \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#800] \n\t" + "str.w r7, [r0,#804] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 38 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#808] \n\t" + "str.w r7, [r0,#812] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 
k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 39 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0xd \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#816] \n\t" + "str.w r7, [r0,#820] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 40 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0xa \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#824] \n\t" + "str.w r7, [r0,#828] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // 
r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 41 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x4 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#832] \n\t" + "str.w r7, [r0,#836] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 42 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#840] \n\t" + "str.w r7, [r0,#844] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" 
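+    /*
+     * Annotation: in the AC steps of this schedule, the eor immediates
+     * are the two halves of the 6-bit round constant rc = RC[round-1]
+     * (see RC[] in skinny_main.c): c0 = rc & 0xF lands in the first key
+     * word and c1 = rc >> 4 in the second. Rounds whose c1 is zero
+     * (e.g. rc = 0x02 at round 26) use a single eor and store r3 as-is.
+     */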
// r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 43 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#848] \n\t" + "str.w r7, [r0,#852] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 44 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x4 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#856] \n\t" + "str.w r7, [r0,#860] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 
\n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 45 + + // AC(c0 c1) + "eor.w r6, r2, #0x8 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#864] \n\t" + "str.w r3, [r0,#868] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 46 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x1 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#872] \n\t" + "str.w r7, [r0,#876] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + 
"mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 47 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#880] \n\t" + "str.w r7, [r0,#884] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 48 + + // AC(c0 c1) + "eor.w r6, r2, #0x4 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#888] \n\t" + "str.w r3, [r0,#892] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 
k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 49 + + // AC(c0 c1) + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#896] \n\t" + "str.w r3, [r0,#900] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 50 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x3 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#904] \n\t" + "str.w r7, [r0,#908] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 
k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 51 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x6 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#912] \n\t" + "str.w r7, [r0,#916] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 52 + + // AC(c0 c1) + "eor.w r6, r2, #0xc \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#920] \n\t" + "str.w r3, [r0,#924] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 
k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 53 + + // AC(c0 c1) + "eor.w r7, r3, #0x1 \n\t" // k0^rc + "eor.w r6, r2, #0x9 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#928] \n\t" + "str.w r7, [r0,#932] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 54 + + // AC(c0 c1) + "eor.w r7, r3, #0x3 \n\t" // k0^rc + "eor.w r6, r2, #0x2 \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#936] \n\t" + "str.w r7, [r0,#940] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 
(k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 55 + + // AC(c0 c1) + "eor.w r7, r3, #0x2 \n\t" // k0^rc + "eor.w r6, r2, #0x5 \n\t" // k0^rc + // round key store + "str.w r6, [r0,#944] \n\t" + "str.w r7, [r0,#948] \n\t" + + // permutation + // r2 (k3 k2 k1 k0) k13 k8 k15 k9 + // r3 (k7 k6 k5 k4) k11 k12 k14 k10 + // r4 (k11 k10 k9 k8) ------> k3 k2 k1 k0 + // r5 (k15 k14 k13 k12) k7 k6 k5 k4 + "mov r6, r4 \n\t" // r6(k11 k10 k9 k8 ) + "mov r7, r5 \n\t" // r7(k15 k14 k13 k12) + "mov r4, r2 \n\t" // r4(k3 k2 k1 k0) + "mov r5, r3 \n\t" // r5(k7 k6 k5 k4) +#ifdef STM32F4 // for Cortex-M4 + "ror.w r2,r7, #16 \n\t" // r2(k13 k12 k15 k14) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 k14) + "pkhtb.w r3,r6, r6, asr #16 \n\t" // r3(k11 k10 k11 k10) + "ror.w r6, #8 \n\t" // r6( k8 k11 k10 k8) + "bfi.w r2,r6, #0,#8 \n\t" // r2(k13 k8 k15 k9) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k11 k10) + "ror.w r7,#16 \n\t" // r7(k13 k12 k15 k14) + "bfi.w r3,r7, #8,#8 \n\t" // r3(k11 k12 k14 k10) +#else // for Cortex-M3 + "rev.w r2, r7 \n\t" // r2(k12 k13 k14 k15) + "lsl.w r2, r2, #8 \n\t" // r2(k13 k14 k15 --) + "bfi.w r2,r6, #16,#8 \n\t" // r2(k13 k8 k15 --) + "lsr.w r6, r6, #8 \n\t" // r6( -- k11 k10 k9) + "bfi.w r2,r6, #0, #8 \n\t" // r2(k13 k8 k15 k9) + "rev16.w r3, r6 \n\t" // r3(k11 -- k9 k10) + "bfi.w r3,r7, #16,#8 \n\t" // r3(k11 k12 k9 k10) + "lsr.w r7, r7, #16 \n\t" // r7 (-- -- k15 k14) + "bfi.w r3,r7, #8, #8 \n\t" // r3(k11 k12 k14 k10) +#endif + // LFSR(for TK3) (x7 x6 x5 x4 x3 x2 x1 x0) -> (x0^x6 x7 x6 x5 x4 x2 x2 x1) + "and.w r6, r8, r2, lsr #1 \n\t" + "and.w r7, r9, r2, lsl #7 \n\t" + "and.w r2, r9, r2, lsl #1 \n\t" + "eor.w r2, r7 \n\t" + "eor.w r2, r6 \n\t" + + "and.w r6, r8, r3, lsr #1 \n\t" + "and.w r7, r9, r3, lsl #7 \n\t" + "and.w r3, r9, r3, lsl #1 \n\t" + "eor.w r3, r7 \n\t" + "eor.w r3, r6 \n\t" + + // round 56 + + // AC(c0 c1) + "eor.w r6, r2, #0xa \n\t" // k0^rc + + // round key store + "str.w r6, [r0,#952] \n\t" + "str.w r3, [r0,#956] \n\t" + + // permutation + + // not need to calculate (not used) + + "ldmia.w sp!, {r4-r9} \n\t" + : 
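+    /*
+     * Annotation: the template addresses r0/r1 directly and relies on
+     * the AAPCS argument registers carrying roundKeys and pRC; r4-r9 are
+     * preserved by the stmdb/ldmia pair inside the template itself.
+     */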
+ : [roundKeys] "r" (roundKeys), [pRC] "r" (pRC) + : "cc"); +} diff --git a/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_main.c b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_main.c new file mode 100644 index 0000000..217b8a6 --- /dev/null +++ b/romulus/Implementations/crypto_aead/romulusn1v12/armsrc/skinny_main.c @@ -0,0 +1,4687 @@ +/****************************************************************************** + * Copyright (c) 2020, NEC Corporation. + * + * THIS CODE IS FURNISHED TO YOU "AS IS" WITHOUT WARRANTY OF ANY KIND. + * + *****************************************************************************/ + +/* + * SKINNY-128-384 + * + * ART(TK1) -> store + * load AC(c0 c1) ^ TK3 ^ TK2 + * load TK1 + * calc AC(c0 c1) ^ TK3 ^ TK2 ^ TK1 -> use at (AC->ART) + * SC->SR->(AC->ART)->MC + * + * number of rounds : 56 + */ + +#include "skinny.h" + +/* + * S-BOX + */ +unsigned char SBOX[512] += { + // Original + 0x65, 0x4c, 0x6a, 0x42, 0x4b, 0x63, 0x43, 0x6b, 0x55, 0x75, 0x5a, 0x7a, 0x53, 0x73, 0x5b, 0x7b, + 0x35, 0x8c, 0x3a, 0x81, 0x89, 0x33, 0x80, 0x3b, 0x95, 0x25, 0x98, 0x2a, 0x90, 0x23, 0x99, 0x2b, + 0xe5, 0xcc, 0xe8, 0xc1, 0xc9, 0xe0, 0xc0, 0xe9, 0xd5, 0xf5, 0xd8, 0xf8, 0xd0, 0xf0, 0xd9, 0xf9, + 0xa5, 0x1c, 0xa8, 0x12, 0x1b, 0xa0, 0x13, 0xa9, 0x05, 0xb5, 0x0a, 0xb8, 0x03, 0xb0, 0x0b, 0xb9, + 0x32, 0x88, 0x3c, 0x85, 0x8d, 0x34, 0x84, 0x3d, 0x91, 0x22, 0x9c, 0x2c, 0x94, 0x24, 0x9d, 0x2d, + 0x62, 0x4a, 0x6c, 0x45, 0x4d, 0x64, 0x44, 0x6d, 0x52, 0x72, 0x5c, 0x7c, 0x54, 0x74, 0x5d, 0x7d, + 0xa1, 0x1a, 0xac, 0x15, 0x1d, 0xa4, 0x14, 0xad, 0x02, 0xb1, 0x0c, 0xbc, 0x04, 0xb4, 0x0d, 0xbd, + 0xe1, 0xc8, 0xec, 0xc5, 0xcd, 0xe4, 0xc4, 0xed, 0xd1, 0xf1, 0xdc, 0xfc, 0xd4, 0xf4, 0xdd, 0xfd, + 0x36, 0x8e, 0x38, 0x82, 0x8b, 0x30, 0x83, 0x39, 0x96, 0x26, 0x9a, 0x28, 0x93, 0x20, 0x9b, 0x29, + 0x66, 0x4e, 0x68, 0x41, 0x49, 0x60, 0x40, 0x69, 0x56, 0x76, 0x58, 0x78, 0x50, 0x70, 0x59, 0x79, + 0xa6, 0x1e, 0xaa, 0x11, 0x19, 0xa3, 0x10, 0xab, 0x06, 0xb6, 0x08, 0xba, 0x00, 0xb3, 0x09, 0xbb, + 0xe6, 0xce, 0xea, 0xc2, 0xcb, 0xe3, 0xc3, 0xeb, 0xd6, 0xf6, 0xda, 0xfa, 0xd3, 0xf3, 0xdb, 0xfb, + 0x31, 0x8a, 0x3e, 0x86, 0x8f, 0x37, 0x87, 0x3f, 0x92, 0x21, 0x9e, 0x2e, 0x97, 0x27, 0x9f, 0x2f, + 0x61, 0x48, 0x6e, 0x46, 0x4f, 0x67, 0x47, 0x6f, 0x51, 0x71, 0x5e, 0x7e, 0x57, 0x77, 0x5f, 0x7f, + 0xa2, 0x18, 0xae, 0x16, 0x1f, 0xa7, 0x17, 0xaf, 0x01, 0xb2, 0x0e, 0xbe, 0x07, 0xb7, 0x0f, 0xbf, + 0xe2, 0xca, 0xee, 0xc6, 0xcf, 0xe7, 0xc7, 0xef, 0xd2, 0xf2, 0xde, 0xfe, 0xd7, 0xf7, 0xdf, 0xff, + + // Original ^ c2(0x02) + 0x67, 0x4e, 0x68, 0x40, 0x49, 0x61, 0x41, 0x69, 0x57, 0x77, 0x58, 0x78, 0x51, 0x71, 0x59, 0x79, + 0x37, 0x8e, 0x38, 0x83, 0x8b, 0x31, 0x82, 0x39, 0x97, 0x27, 0x9a, 0x28, 0x92, 0x21, 0x9b, 0x29, + 0xe7, 0xce, 0xea, 0xc3, 0xcb, 0xe2, 0xc2, 0xeb, 0xd7, 0xf7, 0xda, 0xfa, 0xd2, 0xf2, 0xdb, 0xfb, + 0xa7, 0x1e, 0xaa, 0x10, 0x19, 0xa2, 0x11, 0xab, 0x07, 0xb7, 0x08, 0xba, 0x01, 0xb2, 0x09, 0xbb, + 0x30, 0x8a, 0x3e, 0x87, 0x8f, 0x36, 0x86, 0x3f, 0x93, 0x20, 0x9e, 0x2e, 0x96, 0x26, 0x9f, 0x2f, + 0x60, 0x48, 0x6e, 0x47, 0x4f, 0x66, 0x46, 0x6f, 0x50, 0x70, 0x5e, 0x7e, 0x56, 0x76, 0x5f, 0x7f, + 0xa3, 0x18, 0xae, 0x17, 0x1f, 0xa6, 0x16, 0xaf, 0x00, 0xb3, 0x0e, 0xbe, 0x06, 0xb6, 0x0f, 0xbf, + 0xe3, 0xca, 0xee, 0xc7, 0xcf, 0xe6, 0xc6, 0xef, 0xd3, 0xf3, 0xde, 0xfe, 0xd6, 0xf6, 0xdf, 0xff, + 0x34, 0x8c, 0x3a, 0x80, 0x89, 0x32, 0x81, 0x3b, 0x94, 0x24, 0x98, 0x2a, 0x91, 0x22, 0x99, 0x2b, + 0x64, 0x4c, 0x6a, 0x43, 0x4b, 0x62, 0x42, 0x6b, 0x54, 0x74, 0x5a, 0x7a, 0x52, 0x72, 0x5b, 0x7b, + 0xa4, 0x1c, 0xa8, 
0x13, 0x1b, 0xa1, 0x12, 0xa9, 0x04, 0xb4, 0x0a, 0xb8, 0x02, 0xb1, 0x0b, 0xb9, + 0xe4, 0xcc, 0xe8, 0xc0, 0xc9, 0xe1, 0xc1, 0xe9, 0xd4, 0xf4, 0xd8, 0xf8, 0xd1, 0xf1, 0xd9, 0xf9, + 0x33, 0x88, 0x3c, 0x84, 0x8d, 0x35, 0x85, 0x3d, 0x90, 0x23, 0x9c, 0x2c, 0x95, 0x25, 0x9d, 0x2d, + 0x63, 0x4a, 0x6c, 0x44, 0x4d, 0x65, 0x45, 0x6d, 0x53, 0x73, 0x5c, 0x7c, 0x55, 0x75, 0x5d, 0x7d, + 0xa0, 0x1a, 0xac, 0x14, 0x1d, 0xa5, 0x15, 0xad, 0x03, 0xb0, 0x0c, 0xbc, 0x05, 0xb5, 0x0d, 0xbd, + 0xe0, 0xc8, 0xec, 0xc4, 0xcd, 0xe5, 0xc5, 0xed, 0xd0, 0xf0, 0xdc, 0xfc, 0xd5, 0xf5, 0xdd, 0xfd, +}; + +/* + * Round Constants + */ +unsigned char RC[56] += { + 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3E, 0x3D, 0x3B, + 0x37, 0x2F, 0x1E, 0x3C, 0x39, 0x33, 0x27, 0x0E, + 0x1D, 0x3A, 0x35, 0x2B, 0x16, 0x2C, 0x18, 0x30, + 0x21, 0x02, 0x05, 0x0B, 0x17, 0x2E, 0x1C, 0x38, + 0x31, 0x23, 0x06, 0x0D, 0x1B, 0x36, 0x2D, 0x1A, + 0x34, 0x29, 0x12, 0x24, 0x08, 0x11, 0x22, 0x04, + 0x09, 0x13, 0x26, 0x0c, 0x19, 0x32, 0x25, 0x0a,}; + +extern void Encrypt(unsigned char *block, unsigned char *roundKeys, unsigned char *pSBOX) __attribute__((noinline)); +extern void RunEncryptionKeyScheduleTK2(unsigned char *roundKeys) __attribute__((noinline)); +extern void RunEncryptionKeyScheduleTK3(unsigned char *roundKeys, unsigned char *pRC) __attribute__((noinline)); + +void skinny_128_384_enc123_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K) +{ + *((unsigned int *)&pskinny_ctrl->roundKeys[0] ) = *((unsigned int *)&CNT[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[4] ) = *((unsigned int *)&CNT[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[16]) = *((unsigned int *)&T[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[20]) = *((unsigned int *)&T[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[24]) = *((unsigned int *)&T[8]); + *((unsigned int *)&pskinny_ctrl->roundKeys[28]) = *((unsigned int *)&T[12]); + *((unsigned int *)&pskinny_ctrl->roundKeys[32]) = *((unsigned int *)&K[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[36]) = *((unsigned int *)&K[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[40]) = *((unsigned int *)&K[8]); + *((unsigned int *)&pskinny_ctrl->roundKeys[44]) = *((unsigned int *)&K[12]); + + RunEncryptionKeyScheduleTK3(pskinny_ctrl->roundKeys, RC); + RunEncryptionKeyScheduleTK2(pskinny_ctrl->roundKeys); + Encrypt(input, pskinny_ctrl->roundKeys, SBOX); + + pskinny_ctrl->func_skinny_128_384_enc = skinny_128_384_enc12_12; +} + +void skinny_128_384_enc12_12 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K) +{ + (void)K; + + *((unsigned int *)&pskinny_ctrl->roundKeys[0] ) = *((unsigned int *)&CNT[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[4] ) = *((unsigned int *)&CNT[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[16]) = *((unsigned int *)&T[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[20]) = *((unsigned int *)&T[4]); + *((unsigned int *)&pskinny_ctrl->roundKeys[24]) = *((unsigned int *)&T[8]); + *((unsigned int *)&pskinny_ctrl->roundKeys[28]) = *((unsigned int *)&T[12]); + + RunEncryptionKeyScheduleTK2(pskinny_ctrl->roundKeys); + Encrypt(input, pskinny_ctrl->roundKeys, SBOX); +} + +extern void skinny_128_384_enc1_1 (unsigned char* input, skinny_ctrl* pskinny_ctrl, unsigned char* CNT, unsigned char* T, const unsigned char* K) +{ + (void)T; + (void)K; + + *((unsigned int *)&pskinny_ctrl->roundKeys[0] ) = *((unsigned int *)&CNT[0]); + *((unsigned int *)&pskinny_ctrl->roundKeys[4] ) = *((unsigned int 
*)&CNT[4]); + + Encrypt(input, pskinny_ctrl->roundKeys, SBOX); +} + +__attribute__((aligned(4))) +void Encrypt(unsigned char *block, unsigned char *roundKeys, unsigned char *pSBOX) +{ + // r0 : points to plaintext + // r1 : points to roundKeys(& masterKey) + // r2 : points to SBOX + // r3-r6 : cipher state + // r7-r12: temp use + // r14 : temp use + asm volatile( + "stmdb sp!, {r4-r12,r14} \n\t" + "stmdb.w sp!, {r0} \n\t" // push store pointer + +// ART(TK1) + + "ldm.w r1, {r3-r4} \n\t" // load master key + + // round 1-2 + +// // round key store(do not need) +// "str.w r3, [r1,#0] \n\t" +// "str.w r4, [r1,#4] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k4 k2 k6) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 \n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(-- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 3-4 + + // round key store + "str.w r5, [r1,#8] \n\t" + "str.w r6, [r1,#12] \n\t" + + // permutation + + // r3 (--- --- --- ---) k5 k0 k7 k1 + // r4 (--- --- --- ---) k3 k4 k6 k2 + // r5 ( k3 k2 k1 k0) -----> --- --- --- --- + // r6 ( k7 k6 k5 k4) --- --- --- --- +#ifdef STM32F4 // for Cortex-M4 + "ror.w r3,r6, #16 \n\t" // r3( k5 k4 k7 k6) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 k6) + "pkhtb.w r4,r5, r5, asr #16 \n\t" // r4( k3 k2 k3 k2) + "ror.w r5, #8 \n\t" // r5( k0 k3 k2 k1) + "bfi.w r3,r5, #0,#8 \n\t" // r3( k5 k4 k2 k6) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k3 k2) + "ror.w r6,#16 \n\t" // r6( k5 k4 k7 k6) + "bfi.w r4,r6, #8,#8 \n\t" // r4( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r3, r6 \n\t" // r3( k4 k5 k6 k7) + "lsl.w r3, r3, #8 \n\t" // r3( k5 k6 k7 ---) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 ---) + "lsr.w r5, r5, #8 \n\t" // r5(--- k3 k2 k1) + "bfi.w r3,r5, #0, #8 \n\t" // r3( k5 k0 k7 k1) + "rev16.w r4, r5 \n\t" // r4( k3 --- k1 k2) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k1 k2) + "lsr.w r6, r6, #16 \n\t" // r6(-- --- k7 k6) + "bfi.w r4,r6, #8, #8 \n\t" // r4( k3 k4 k6 k2) +#endif + + // round 5-6 + + // round key store + "str.w r3, [r1,#16] \n\t" + "str.w r4, [r1,#20] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k4 k2 k6) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4
\n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(-- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 7-8 + + // round key store + "str.w r5, [r1,#24] \n\t" + "str.w r6, [r1,#28] \n\t" + + // permutation + + // r3 (--- --- --- ---) k5 k0 k7 k1 + // r4 (--- --- --- ---) k3 k4 k6 k2 + // r5 ( k3 k2 k1 k0) -----> --- --- --- --- + // r6 ( k7 k6 k5 k4) --- --- --- --- +#ifdef STM32F4 // for Cortex-M4 + "ror.w r3,r6, #16 \n\t" // r3( k5 k4 k7 k6) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 k6) + "pkhtb.w r4,r5, r5, asr #16 \n\t" // r4( k3 k2 k3 k2) + "ror.w r5, #8 \n\t" // r5( k0 k3 k2 k1) + "bfi.w r3,r5, #0,#8 \n\t" // r3( k5 k4 k2 k6) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k3 k2) + "ror.w r6,#16 \n\t" // r6( k5 k4 k7 k6) + "bfi.w r4,r6, #8,#8 \n\t" // r4( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r3, r6 \n\t" // r3( k4 k5 k6 k7) + "lsl.w r3, r3, #8 \n\t" // r3( k5 k6 k7 ---) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 ---) + "lsr.w r5, r5, #8 \n\t" // r5(--- k3 k2 k1) + "bfi.w r3,r5, #0, #8 \n\t" // r3( k5 k0 k7 k1) + "rev16.w r4, r5 \n\t" // r4( k3 --- k1 k2) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k1 k2) + "lsr.w r6, r6, #16 \n\t" // r6(-- --- k7 k6) + "bfi.w r4,r6, #8, #8 \n\t" // r4( k3 k4 k6 k2) +#endif + + // round 9-10 + + // round key store + "str.w r3, [r1,#32] \n\t" + "str.w r4, [r1,#36] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k4 k2 k6) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 \n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(-- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 11-12 + + // round key store + "str.w r5, [r1,#40] \n\t" + "str.w r6, [r1,#44] \n\t" + + // permutation + + // r3 (--- --- --- ---) k5 k0 k7 k1 + // r4 (--- --- --- ---) k3 k4 k6 k2 + // r5 ( k3 k2 k1 k0) -----> --- --- --- --- + // r6 ( k7 k6 k5 k4) --- --- --- --- +#ifdef STM32F4 // for Cortex-M4 + "ror.w r3,r6, #16 \n\t" // r3( k5 k4 k7 k6) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0 k7 k6) + "pkhtb.w r4,r5, r5, asr #16 \n\t" // r4( k3 k2 k3 k2) + "ror.w r5, #8 \n\t" // r5( k0 k3 k2 k1) + "bfi.w r3,r5, #0,#8 \n\t" // r3( k5 k4 k2 k6) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k3 k2) + "ror.w r6,#16 \n\t" // r6( k5 k4 k7 k6) + "bfi.w r4,r6, #8,#8 \n\t" // r4( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r3, r6 \n\t" // r3( k4 k5 k6 k7) + "lsl.w r3, r3, #8 \n\t" // r3( k5 k6 k7 ---) + "bfi.w r3,r5, #16,#8 \n\t" // r3( k5 k0
k7 ---) + "lsr.w r5, r5, #8 \n\t" // r5(--- k3 k2 k1) + "bfi.w r3,r5, #0, #8 \n\t" // r3( k5 k0 k7 k1) + "rev16.w r4, r5 \n\t" // r4( k3 --- k1 k2) + "bfi.w r4,r6, #16,#8 \n\t" // r4( k3 k4 k1 k2) + "lsr.w r6, r6, #16 \n\t" // r6(-- --- k7 k6) + "bfi.w r4,r6, #8, #8 \n\t" // r4( k3 k4 k6 k2) +#endif + + // round 13-14 + + // round key store + "str.w r3, [r1,#48] \n\t" + "str.w r4, [r1,#52] \n\t" + + // permutation + + // r3 ( k3 k2 k1 k0) --- --- --- --- + // r4 ( k7 k6 k5 k4) --- --- --- --- + // r5 (--- --- --- ---) -----> k5 k0 k7 k1 + // r6 (--- --- --- ---) k3 k4 k6 k2 +#ifdef STM32F4 // for Cortex-M4 + "ror.w r5,r4, #16 \n\t" // r5( k5 k4 k7 k6) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 k6) + "pkhtb.w r6,r3, r3, asr #16 \n\t" // r6( k3 k2 k3 k2) + "ror.w r3, #8 \n\t" // r3( k0 k3 k2 k1) + "bfi.w r5,r3, #0,#8 \n\t" // r5( k5 k4 k2 k6) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k3 k2) + "ror.w r4,#16 \n\t" // r4( k5 k4 k7 k6) + "bfi.w r6,r4, #8,#8 \n\t" // r6( k3 k4 k6 k2) +#else // for Cortex-M3 + "rev.w r5, r4 \n\t" // r5( k4 k5 k6 k7) + "lsl.w r5, r5, #8 \n\t" // r5( k5 k6 k7 ---) + "bfi.w r5,r3, #16,#8 \n\t" // r5( k5 k0 k7 ---) + "lsr.w r3, r3, #8 \n\t" // r3(--- k3 k2 k1) + "bfi.w r5,r3, #0, #8 \n\t" // r5( k5 k0 k7 k1) + "rev16.w r6, r3 \n\t" // r6( k3 --- k1 k2) + "bfi.w r6,r4, #16,#8 \n\t" // r6( k3 k4 k1 k2) + "lsr.w r4, r4, #16 \n\t" // r4(-- --- k7 k6) + "bfi.w r6,r4, #8, #8 \n\t" // r6( k3 k4 k6 k2) +#endif + // round 15-16 + + // round key store + "str.w r5, [r1,#56] \n\t" + "str.w r6, [r1,#60] \n\t" + + // permutation + + // no need to compute it (not used) + +// SC->(AC->ART)->SR->MC + + "add.w r14, r2, #256 \n\t" // point to SBOX ^ c2(0x02) + + "ldm.w r0, {r3-r6} \n\t" // load plaintext + // r0 now free to overwrite + + // round 1 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and
AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#64] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#68] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 2 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#72] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#76] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 3 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // 
s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#80] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#84] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 4 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#88] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + 
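+ // Annotation (not from the original source): r0, the block pointer, was
+ // saved with "stmdb.w sp!, {r0}" at function entry, so the even rounds can
+ // reuse it here as a scratch register for the precomputed
+ // TK2 ^ TK3 ^ AC(c0 c1) words; interleaving the "ldr.w r0, [r1,#...]"
+ // fetches with the S-box lookups presumably also helps hide load-use
+ // stalls on Cortex-M3/M4.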
"ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#92] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 5 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#96] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#100] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 6 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + 
// r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#104] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#108] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 7 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" 
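+ // Annotation (not from the original source): only s8 is looked up through
+ // r14 = pSBOX + 256, the second copy of the table with the round constant
+ // c2 pre-XORed in; in C terms, roughly:
+ //   s8 = SBOX[256 + s8];   // == SBOX[s8] ^ 0x02
+ // Since c2 only touches cell 8, this folds AddConstants' c2 into SubCells
+ // and saves an explicit eor in every round.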
+ "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#112] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#116] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 8 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#120] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#124] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + 
// r4 ----------------> r4 + // round 9 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#32] \n\t" // load TK1 + "ldr.w r10, [r1,#36] \n\t" // load TK1 + "ldr.w r11, [r1,#128] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#132] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 10 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#136] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 
\n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#140] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 11 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#40] \n\t" // load TK1 + "ldr.w r10, [r1,#44] \n\t" // load TK1 + "ldr.w r11, [r1,#144] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#148] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 
-----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 12 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#152] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#156] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 13 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" 
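+ // Annotation (not from the original source): the uxtb/ror extraction order
+ // already realizes ShiftRows, so the eor/lsl merges rebuild each row
+ // pre-rotated; for the second row this is, in C terms, roughly:
+ //   row1 = S(s7) | (S(s4) << 8) | (S(s5) << 16) | (S(s6) << 24);
+ // i.e. the row (s7 s6 s5 s4) rotated right by one cell.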
+ "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#48] \n\t" // load TK1 + "ldr.w r10, [r1,#52] \n\t" // load TK1 + "ldr.w r11, [r1,#160] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#164] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 14 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#168] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#172] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, 
r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 15 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#56] \n\t" // load TK1 + "ldr.w r10, [r1,#60] \n\t" // load TK1 + "ldr.w r11, [r1,#176] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#180] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 16 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // 
s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#184] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#188] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 17 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 
\n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#192] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#196] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 18 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#200] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#204] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 19 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 
s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#208] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#212] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 20 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#216] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + 
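+ // Annotation (not from the original source): rows 2 and 3 are repacked
+ // here as (s9 s8 s11 s10) and (s12 s15 s14 s13), i.e. ShiftRows' rotations
+ // by two and three cells are likewise absorbed into the extraction order.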
"uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#220] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 21 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#224] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#228] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + 
// round 22 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#232] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#236] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 23 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" 
// s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#240] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#244] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 24 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#248] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#252] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 
--------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 25 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#32] \n\t" // load TK1 + "ldr.w r10, [r1,#36] \n\t" // load TK1 + "ldr.w r11, [r1,#256] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#260] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 26 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#264] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, 
r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#268] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 27 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#40] \n\t" // load TK1 + "ldr.w r10, [r1,#44] \n\t" // load TK1 + "ldr.w r11, [r1,#272] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#276] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ 
AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 28 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#280] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#284] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 29 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] 
\n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#48] \n\t" // load TK1 + "ldr.w r10, [r1,#52] \n\t" // load TK1 + "ldr.w r11, [r1,#288] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#292] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 30 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#296] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#300] \n\t" // 
load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 31 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#56] \n\t" // load TK1 + "ldr.w r10, [r1,#60] \n\t" // load TK1 + "ldr.w r11, [r1,#304] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#308] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 32 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 
\n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#312] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#316] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 33 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, 
lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#320] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#324] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 34 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#328] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#332] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 35 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w 
r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#336] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#340] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 36 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#344] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 
\n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#348] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 37 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#352] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#356] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" 
// r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 38 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#360] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#364] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 39 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, 
r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#368] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#372] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 40 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#376] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#380] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + 
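+ // (no separate key XOR is needed in this round: the r0 words loaded at
+ //  #376/#380 were already folded into rows 0-1 during the repacking above)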
+ // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + + // round 41 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#32] \n\t" // load TK1 + "ldr.w r10, [r1,#36] \n\t" // load TK1 + "ldr.w r11, [r1,#384] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#388] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 42 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#392] \n\t" // load 
TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#396] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 43 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#40] \n\t" // load TK1 + "ldr.w r10, [r1,#44] \n\t" // load TK1 + "ldr.w r11, [r1,#400] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) 
+ "ldr.w r12, [r1,#404] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 44 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#408] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#412] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 45 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w 
r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#48] \n\t" // load TK1 + "ldr.w r10, [r1,#52] \n\t" // load TK1 + "ldr.w r11, [r1,#416] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#420] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 46 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#424] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, 
[r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#428] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 47 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#56] \n\t" // load TK1 + "ldr.w r10, [r1,#60] \n\t" // load TK1 + "ldr.w r11, [r1,#432] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#436] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 48 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w 
r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#440] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#444] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 49 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + 
"ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#0] \n\t" // load TK1 + "ldr.w r10, [r1,#4] \n\t" // load TK1 + "ldr.w r11, [r1,#448] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#452] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 50 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#456] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#460] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 51 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 
(s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#8] \n\t" // load TK1 + "ldr.w r10, [r1,#12] \n\t" // load TK1 + "ldr.w r11, [r1,#464] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#468] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 52 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#472] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 
\n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#476] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 53 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#16] \n\t" // load TK1 + "ldr.w r10, [r1,#20] \n\t" // load TK1 + "ldr.w r11, [r1,#480] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#484] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 
s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 54 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#488] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#492] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 \n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + // round 55 + + // SubCell+ShiftRow+AC(c2) + // r3 (s3 s2 s1 s0) + // r4 (s7 s6 s5 s4) + // r5 (s11 s10 s9 s8) + // r6 (s15 s14 s13 s12) + + // 1st-2nd line + // r3(s3 s2 s1 s0) + "uxtb.w r9, r3, ror #24 \n\t" // s3 + "uxtb.w r8, r3, ror #16 \n\t" // s2 + "uxtb.w r7, r3, ror #8 \n\t" // s1 + "uxtb.w r3, r3 \n\t" // s0 + // r4(s6 s5 s4 s7) + "uxtb.w r12, r4, ror #16 \n\t" // s6 + "uxtb.w r11, r4, ror #8 \n\t" // s5 + "uxtb.w r10, r4 \n\t" // s4 + "uxtb.w r4, r4, ror #24 \n\t" // s7 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "eor.w r3, r3, r7, lsl #8 \n\t" + "eor.w r3, r3, r8, lsl #16 \n\t" + "eor.w r3, r3, r9, lsl #24 \n\t" + "eor.w r4, r4, r10, lsl #8 \n\t" + "eor.w r4, r4, r11, lsl #16 \n\t" + "eor.w r4, r4, r12, lsl #24 \n\t" + + // 3rd-4th line + // r5(s9 s8 s11 s10) + "uxtb.w 
r9, r5, ror #8 \n\t" // s9 + "uxtb.w r8, r5 \n\t" // s8 + "uxtb.w r7, r5, ror #24 \n\t" // s11 + "uxtb.w r5, r5, ror #16 \n\t" // s10 + // r6(s12 s15 s14 s13) + "uxtb.w r12, r6 \n\t" // s12 + "uxtb.w r11, r6, ror #24 \n\t" // s15 + "uxtb.w r10, r6, ror #16 \n\t" // s14 + "uxtb.w r6, r6, ror #8 \n\t" // s13 + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r8, [r14,r8] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "eor.w r5, r5, r7, lsl #8 \n\t" + "eor.w r5, r5, r8, lsl #16 \n\t" + "eor.w r5, r5, r9, lsl #24 \n\t" + "eor.w r6, r6, r10, lsl #8 \n\t" + "eor.w r6, r6, r11, lsl #16 \n\t" + "eor.w r6, r6, r12, lsl #24 \n\t" + + // AddRoundKey and AddRoundConst(from roundKeys) + "ldr.w r9, [r1,#24] \n\t" // load TK1 + "ldr.w r10, [r1,#28] \n\t" // load TK1 + "ldr.w r11, [r1,#496] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "ldr.w r12, [r1,#500] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r11, r9 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r12, r10 \n\t" // TK1 ^ TK2 ^ TK3 ^ AC(c0 c1) + + "eor.w r8, r3, r11 \n\t" // r3 eor r11 -----------------> r8( s3 s2 s1 s0) + "eor.w r7, r4, r12, ror 24 \n\t" // r4 eor (r12 ror 24) --------> r7( s6 s5 s4 s7) + // r8( s9 s8 s11 s10) + // r6(s12 s14 s14 s13) + + // MixColumn + "eor.w r10, r8, r5 \n\t" // r8 eor r5 ---------> r10 + "eor.w r9, r7, r5 \n\t" // r7 eor r5 ---------> r9 + "eor.w r7, r6, r10 \n\t" // r6 eor r10 --------> r7 + // r8 ----------------> r8 + + // round 56 + + // SubCell+ShiftRow+AC(c2) + // r7 (s3 s2 s1 s0) + // r8 (s7 s6 s5 s4) + // r9 (s11 s10 s9 s8) + // r10(s15 s14 s13 s12) + + // 1st-2nd line + // r7(s3 s2 s1 s0) + "uxtb.w r5, r7, ror #24 \n\t" // s3 + "uxtb.w r4, r7, ror #16 \n\t" // s2 + "uxtb.w r3, r7, ror #8 \n\t" // s1 + "uxtb.w r7, r7 \n\t" // s0 + // r8(s6 s5 s4 s7) + "uxtb.w r12, r8, ror #16 \n\t" // s6 + "uxtb.w r11, r8, ror #8 \n\t" // s5 + "uxtb.w r6, r8 \n\t" // s4 + "uxtb.w r8, r8, ror #24 \n\t" // s7 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r4, [r2,r4] \n\t" + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r7, [r2,r7] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r8, [r2,r8] \n\t" + "ldr.w r0, [r1,#504] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r7, r7, r3, lsl #8 \n\t" + "eor.w r7, r7, r4, lsl #16 \n\t" + "eor.w r7, r7, r5, lsl #24 \n\t" + "eor.w r8, r8, r6, lsl #8 \n\t" + "eor.w r8, r8, r11, lsl #16 \n\t" + "eor.w r8, r8, r12, lsl #24 \n\t" + "eor.w r4, r7, r0 \n\t" // r7 eor r0 -----------------> r4( s3 s2 s1 s0) + + // 3rd-4th line + // r9(s9 s8 s11 s10) + "uxtb.w r5, r9, ror #8 \n\t" // s9 + "uxtb.w r7, r9 \n\t" // s8 + "uxtb.w r3, r9, ror #24 \n\t" // s11 + "uxtb.w r9, r9, ror #16 \n\t" // s10 + // r10(s12 s15 s14 s13) + "uxtb.w r12, r10 \n\t" // s12 + "uxtb.w r11, r10, ror #24 \n\t" // s15 + "uxtb.w r6, r10, ror #16 \n\t" // s14 + "uxtb.w r10, r10, ror #8 \n\t" // s13 + "ldrb.w r5, [r2,r5] \n\t" + "ldrb.w r7, [r14,r7] \n\t" // load from SBOX ^ c2(0x02) + "ldrb.w r3, [r2,r3] \n\t" + "ldrb.w r9, [r2,r9] \n\t" + "ldrb.w r12, [r2,r12] \n\t" + "ldrb.w r11, [r2,r11] \n\t" + "ldrb.w r6, [r2,r6] \n\t" + "ldrb.w r10, [r2,r10] \n\t" + "ldr.w r0, [r1,#508] \n\t" // load TK2 ^ TK3 ^ AC(c0 c1) + "eor.w r9, r9, r3, lsl #8 \n\t" + "eor.w r9, r9, r7, lsl #16 \n\t" + "eor.w r9, r9, r5, lsl #24 \n\t" + "eor.w r10, r10, r6, lsl #8 \n\t" + "eor.w r10, r10, r11, lsl #16 \n\t" + "eor.w r10, r10, r12, lsl #24 \n\t" + "eor.w r3, r8, r0, ror 24 
\n\t" // r8 eor (r0 ror 24) --------> r3( s6 s5 s4 s7) + + // AddRoundKey and AddRoundConst(from roundKeys) + + // r9( s9 s8 s11 s10) + // r10(s12 s14 s14 s13) + + // MixColumn + "eor.w r6, r4, r9 \n\t" // r4 eor r9 --------> r6 + "eor.w r5, r3, r9 \n\t" // r3 eor r9 --------> r5 + "eor.w r3, r10, r6 \n\t" // r10 eor r6 --------> r3 + // r4 ----------------> r4 + + "ldmia.w sp!, {r0} \n\t" // pop store pointer + // r0 reload + + "str.w r3, [r0,#0] \n\t" // store ciphertext + "str.w r4, [r0,#4] \n\t" // store ciphertext + "str.w r5, [r0,#8] \n\t" // store ciphertext + "str.w r6, [r0,#12] \n\t" // store ciphertext + + "ldmia.w sp!, {r4-r12,r14} \n\t" + : + : [block] "r" (block), [roundKeys] "r" (roundKeys), [pSBOX] "" (pSBOX) + : "cc"); +} +