#include #include "api.h" #include #include #include //sse2 header file(include sse header file) #define U64BIG(x) (x) #define U32BIG(x) (x) #define PR0_ROUNDS 100 #define PR_ROUNDS 52 #define PRF_ROUNDS 56 typedef unsigned char u8; typedef unsigned long long u64; typedef long long i64; #define forward_sbox_SSE(a, b, c, d, f, g, h) \ { \ tmm1 =_mm_xor_si128( a , all1 ); \ tmm2 =_mm_and_si128( b, tmm1 ); \ tmm3 =_mm_xor_si128( c , tmm2 ); \ h=_mm_xor_si128( d , tmm3 ); \ tmm5 =_mm_or_si128 ( b, c); \ tmm6=_mm_xor_si128( d, tmm1 ); \ g=_mm_xor_si128( tmm5, tmm6 ); \ tmm8=_mm_xor_si128( b, d ); \ tmm9=_mm_and_si128( tmm3, tmm6 ); \ a=_mm_xor_si128( tmm8, tmm9 ); \ tmm11=_mm_and_si128( g, tmm8 ); \ f=_mm_xor_si128( tmm3, tmm11 ); \ } u8 constant7[100] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x41, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x61, 0x42, 0x05, 0x0a, 0x14, 0x28, 0x51, 0x23, 0x47, 0x0f, 0x1e, 0x3c, 0x79, 0x72, 0x64, 0x48, 0x11, 0x22, 0x45, 0x0b, 0x16, 0x2c, 0x59, 0x33, 0x67, 0x4e, 0x1d, 0x3a, 0x75, 0x6a, 0x54, 0x29, 0x53, 0x27, 0x4f, 0x1f, 0x3e, 0x7d, 0x7a, 0x74, 0x68, 0x50, 0x21, 0x43, 0x07, 0x0e, 0x1c, 0x38, 0x71, 0x62, 0x44, 0x09, 0x12, 0x24, 0x49, 0x13, 0x26, 0x4d, 0x1b, 0x36, 0x6d, 0x5a, 0x35, 0x6b, 0x56, 0x2d, 0x5b, 0x37, 0x6f, 0x5e, 0x3d, 0x7b, 0x76, 0x6c, 0x58, 0x31, 0x63, 0x46, 0x0d, 0x1a, 0x34, 0x69, 0x52, 0x25, 0x4b, 0x17, 0x2e, 0x5d }; #define sbox(a, b, c, d, f, g, h) \ { \ t1 = ~a; t2 = b & t1;t3 = c ^ t2; h = d ^ t3; t5 = b | c; t6 = d ^ t1; g = t5 ^ t6; t8 = b ^ d; t9 = t3 & t6; a = t8 ^ t9; t11 = g & t8; f = t3 ^ t11; \ } #define LOTR1281(a,b,n) (((a)<<(n))|((b)>>(64-n))) #define LOTR1282(a,b,n) (((b)<<(n))|((a)>>(64-n))) #define U64BIG(x) (x) #define RATE 16 #define PR0_ROUNDS 100 #define PR_ROUNDS 52 #define PRF_ROUNDS 56 #define ROUND512(i) {\ state[0] = _mm_xor_si128(state[0], _mm_set_epi64x(0, (u64)constant7[i])); \ forward_sbox_SSE(state[0], state[1], state[2], state[3], out1, out2, out3); \ state[1] = _mm_or_si128(_mm_slli_epi64(out1, 1), _mm_srli_epi64(_mm_shuffle_epi32(out1, _MM_SHUFFLE(1, 0, 3, 2)), 63)); \ state[2] = _mm_or_si128(_mm_slli_epi64(out2, 16), _mm_srli_epi64(_mm_shuffle_epi32(out2, _MM_SHUFFLE(1, 0, 3, 2)), 48)); \ state[3] = _mm_or_si128(_mm_slli_epi64(out3, 25), _mm_srli_epi64(_mm_shuffle_epi32(out3, _MM_SHUFFLE(1, 0, 3, 2)), 39)); \ } int crypto_aead_encrypt(unsigned char *c, unsigned long long *clen, const unsigned char *m, unsigned long long mlen, const unsigned char *ad, unsigned long long adlen, const unsigned char *nsec, const unsigned char *npub, const unsigned char *k) { *clen = mlen + CRYPTO_ABYTES; __m128i all1 = _mm_set1_epi32(0xffffffff); //__m128i tmm0, tmm1, tmm2, tmm3, tmm4, tmm5, tmm6, tmm7, tmm8, tmm9, tmm10, tmm11, out1, out2, out3; __m128i tmm1, tmm2, tmm3, tmm5, tmm6, tmm8, tmm9, tmm11, out1, out2, out3; __m128i state[4]; u64 i; u8 tempData[16] = { 0 }; // initialization state[0] = _mm_loadu_si128((__m128i*)(npub)); state[1] = _mm_loadu_si128((__m128i*)(npub + 16)); state[2] = _mm_loadu_si128((__m128i*)(k)); state[3] = _mm_loadu_si128((__m128i*)(k + 16)); for (i = 0; i < PR0_ROUNDS; i++) { ROUND512(i); } // process associated data if (adlen) { while (adlen >= RATE) { state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(ad))); for (i = 0; i < PR_ROUNDS; i++) { ROUND512(i); } adlen -= RATE; ad += RATE; } memset(tempData, 0, RATE); memcpy(tempData, ad, adlen); tempData[adlen] = 0x01; state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(tempData))); for (i = 0; i < PR_ROUNDS; i++) { ROUND512(i); } } state[3] = _mm_xor_si128(state[3], _mm_set_epi64x((u64)0x8000000000000000, 0)); // process plaintext if (mlen) { while (mlen >= RATE) { state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(m))); memcpy(c, state, RATE); for (i = 0; i < PR_ROUNDS; i++) { ROUND512(i); } mlen -= RATE; m += RATE; c += RATE; } memset(tempData, 0, RATE); memcpy(tempData, m, mlen); tempData[mlen] = 0x01; state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(tempData))); memcpy(c, state, mlen); c += mlen; } // finalization for (i = 0; i < PRF_ROUNDS; i++) { ROUND512(i); } // return tag memcpy(c, state, sizeof(unsigned char) * CRYPTO_ABYTES); return 0; } int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen, unsigned char *nsec, const unsigned char *c, unsigned long long clen, const unsigned char *ad, unsigned long long adlen, const unsigned char *npub, const unsigned char *k) { *mlen = clen - CRYPTO_ABYTES; if (clen < CRYPTO_ABYTES) return -1; __m128i all1 = _mm_set1_epi32(0xffffffff); __m128i tmm1, tmm2, tmm3, tmm5, tmm6, tmm8, tmm9, tmm11, out1, out2, out3; __m128i state[4]; u64 i; u8 tempData[16] = { 0 }; // initialization state[0] = _mm_loadu_si128((__m128i*)(npub)); state[1] = _mm_loadu_si128((__m128i*)(npub + 16)); state[2] = _mm_loadu_si128((__m128i*)(k)); state[3] = _mm_loadu_si128((__m128i*)(k + 16)); for (i = 0; i < PR0_ROUNDS; i++) { ROUND512(i); } // process associated data if (adlen) { while (adlen >= RATE) { state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(ad))); for (i = 0; i < PR_ROUNDS; i++) { ROUND512(i); } adlen -= RATE; ad += RATE; } memset(tempData, 0, RATE); memcpy(tempData, ad, adlen); tempData[adlen] = 0x01; state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(tempData))); for (i = 0; i < PR_ROUNDS; i++) { ROUND512(i); } } state[3] = _mm_xor_si128(state[3], _mm_set_epi64x((u64)0x8000000000000000, 0)); clen -= CRYPTO_ABYTES; if (clen) { while (clen >= RATE) { state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(c))); memcpy(m, state, RATE); memcpy(state, c, RATE); for (i = 0; i < PR_ROUNDS; i++) { ROUND512(i); } clen -= RATE; m += RATE; c += RATE; } memset(tempData, 0, RATE); memcpy(tempData, c, clen); tempData[clen] = 0x01; state[0] = _mm_xor_si128(state[0], _mm_loadu_si128((__m128i*)(tempData))); memcpy(m, state, clen); memcpy(state, c, clen); c += clen; } // finalization for (i = 0; i < PRF_ROUNDS; i++) { ROUND512(i); } if (memcmp((void*)state, (void*)c, CRYPTO_ABYTES)) { memset(m, 0, sizeof(unsigned char) * (*mlen)); *mlen = 0; return -1; } return 0; }