#ifndef PERMUTATIONS_H_ #define PERMUTATIONS_H_ #include "ascon.h" #include "config.h" #include "round.h" static const uint64_t C[12] = { 0xffffffffffffff0full, 0xffffffffffffff1eull, 0xffffffffffffff2dull, 0xffffffffffffff3cull, 0xffffffffffffff4bull, 0xffffffffffffff5aull, 0xffffffffffffff69ull, 0xffffffffffffff78ull, 0xffffffffffffff87ull, 0xffffffffffffff96ull, 0xffffffffffffffa5ull, 0xffffffffffffffb4ull, }; #define P12() \ __asm__ __volatile__ ( \ ".arm \n\t" \ ".fpu neon \n\t" \ "vldm %[s], {d0-d4} \n\t" \ "vmvn d2, d2 \n\t" \ ROUND(0) \ ROUND(8) \ ROUND(16) \ ROUND(24) \ ROUND(32) \ ROUND(40) \ ROUND(48) \ ROUND(56) \ ROUND(64) \ ROUND(72) \ ROUND(80) \ ROUND(88) \ "vmvn d2, d2 \n\t" \ "vstm %[s], {d0-d4} \n\t" \ :: [s] "r" (&s), [C] "r" (C) \ : "d0", "d1", "d2", "d3", "d4", \ "d10", "d11", "d12", "d13", "d14", \ "d20", "d21", "d22", "d23", "d24", \ "d31", "memory") #define P6() \ __asm__ __volatile__ ( \ ".arm \n\t" \ ".fpu neon \n\t" \ "vldm %[s], {d0-d4} \n\t" \ "vmvn d2, d2 \n\t" \ ROUND(48) \ ROUND(56) \ ROUND(64) \ ROUND(72) \ ROUND(80) \ ROUND(88) \ "vmvn d2, d2 \n\t" \ "vstm %[s], {d0-d4} \n\t" \ :: [s] "r" (&s), [C] "r" (C) \ : "d0", "d1", "d2", "d3", "d4", \ "d10", "d11", "d12", "d13", "d14", \ "d20", "d21", "d22", "d23", "d24", \ "d31", "memory") #define AD() \ do { \ uint32_t adlen_hi = (uint32_t)(adlen >> 32); \ uint32_t adlen_lo = (uint32_t)adlen; \ __asm__ __volatile__ ( \ ".arm \n\t" \ ".fpu neon \n\t" \ "cmp %[adlen_hi], #0 \n\t" \ "cmpeq %[adlen_lo], #7 \n\t" \ "bls .LAD1 \n\t" \ "vldm %[s], {d0-d4} \n\t" \ ".LAD0: \n\t" \ "vldm %[ad]!, {d16} \n\t" \ "vrev64.8 d16, d16 \n\t" \ "veor d0, d0, d16 \n\t" \ "vmvn d2, d2 \n\t" \ ROUND(48) \ ROUND(56) \ ROUND(64) \ ROUND(72) \ ROUND(80) \ ROUND(88) \ "vmvn d2, d2 \n\t" \ "sub %[adlen_lo], %[adlen_lo], #8 \n\t" \ "sbc %[adlen_hi], %[adlen_hi], #0 \n\t" \ "cmp %[adlen_hi], #0 \n\t" \ "cmpeq %[adlen_lo], #7 \n\t" \ "bhi .LAD0 \n\t" \ "vstm %[s], {d0-d4} \n\t" \ ".LAD1: \n\t" \ : [adlen_hi] "+r" (adlen_hi), [adlen_lo] "+r" (adlen_lo), \ [ad] "+r" (ad) \ : [s] "r" (&s), [C] "r" (C) \ : "d0", "d1", "d2", "d3", "d4", \ "d10", "d11", "d12", "d13", "d14", "d16", \ "d20", "d21", "d22", "d23", "d24", \ "d31", "memory"); \ adlen = (uint64_t)adlen_hi << 32 | adlen_lo; \ } while (0) #define PT() \ do { \ uint32_t mlen_hi = (uint32_t)(mlen >> 32); \ uint32_t mlen_lo = (uint32_t)mlen; \ __asm__ __volatile__ ( \ ".arm \n\t" \ ".fpu neon \n\t" \ "cmp %[mlen_hi], #0 \n\t" \ "cmpeq %[mlen_lo], #7 \n\t" \ "bls .LPT1 \n\t" \ "vldm %[s], {d0-d4} \n\t" \ ".LPT0: \n\t" \ "vldm %[m]!, {d16} \n\t" \ "vrev64.8 d16, d16 \n\t" \ "veor d0, d0, d16 \n\t" \ "vrev64.8 d26, d0 \n\t" \ "vstm %[c]!, {d26} \n\t" \ "vmvn d2, d2 \n\t" \ ROUND(48) \ ROUND(56) \ ROUND(64) \ ROUND(72) \ ROUND(80) \ ROUND(88) \ "vmvn d2, d2 \n\t" \ "sub %[mlen_lo], %[mlen_lo], #8 \n\t" \ "sbc %[mlen_hi], %[mlen_hi], #0 \n\t" \ "cmp %[mlen_hi], #0 \n\t" \ "cmpeq %[mlen_lo], #7 \n\t" \ "bhi .LPT0 \n\t" \ "vstm %[s], {d0-d4} \n\t" \ ".LPT1: \n\t" \ : [mlen_hi] "+r" (mlen_hi), [mlen_lo] "+r" (mlen_lo), \ [m] "+r" (m), [c] "+r" (c) \ : [s] "r" (&s), [C] "r" (C) \ : "d0", "d1", "d2", "d3", "d4", \ "d10", "d11", "d12", "d13", "d14", "d16", \ "d20", "d21", "d22", "d23", "d24", "d26", \ "d31", "memory"); \ mlen = (uint64_t)mlen_hi << 32 | mlen_lo; \ } while (0) #define CT() \ do { \ uint32_t clen_hi = (uint32_t)(clen >> 32); \ uint32_t clen_lo = (uint32_t)clen; \ __asm__ __volatile__ ( \ ".arm \n\t" \ ".fpu neon \n\t" \ "cmp %[clen_hi], #0 \n\t" \ "cmpeq %[clen_lo], #7 \n\t" \ "bls .LCT1 \n\t" \ "vldm %[s], {d0-d4} \n\t" \ ".LCT0: \n\t" \ "vldm %[c]!, {d26} \n\t" \ "vrev64.8 d16, d0 \n\t" \ "veor d16, d16, d26 \n\t" \ "vrev64.8 d0, d26 \n\t" \ "vstm %[m]!, {d16} \n\t" \ "vmvn d2, d2 \n\t" \ ROUND(48) \ ROUND(56) \ ROUND(64) \ ROUND(72) \ ROUND(80) \ ROUND(88) \ "vmvn d2, d2 \n\t" \ "sub %[clen_lo], %[clen_lo], #8 \n\t" \ "sbc %[clen_hi], %[clen_hi], #0 \n\t" \ "cmp %[clen_hi], #0 \n\t" \ "cmpeq %[clen_lo], #7 \n\t" \ "bhi .LCT0 \n\t" \ "vstm %[s], {d0-d4} \n\t" \ ".LCT1: \n\t" \ : [clen_hi] "+r" (clen_hi), [clen_lo] "+r" (clen_lo), \ [m] "+r" (m), [c] "+r" (c) \ : [s] "r" (&s), [C] "r" (C) \ : "d0", "d1", "d2", "d3", "d4", \ "d10", "d11", "d12", "d13", "d14", "d16", \ "d20", "d21", "d22", "d23", "d24", "d26", \ "d31", "memory"); \ clen = (uint64_t)clen_hi << 32 | clen_lo; \ } while (0) #endif /* PERMUTATIONS_H_ */