ace.c 42.1 KB
Newer Older
lwc-tester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759
/* Reference implementation of ACE-128, AEAD
 Written by:
 Kalikinkar Mandal <kmandal@uwaterloo.ca>
 */

#include "ace.h"

static inline __m256i _mm256_loadu2_m128i(__m128i *high,__m128i *low)
{
        return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128(high)),_mm_loadu_si128(low),1);
}
static __attribute__((always_inline)) inline __m256i _mm256_unpacklo_epi128(__m256i a, __m256i b)
{
        return _mm256_permute2x128_si256(a, b, 0x20);
}

static __attribute__((always_inline)) inline __m256i _mm256_unpackhi_epi128(__m256i a, __m256i b)
{
        return _mm256_permute2x128_si256(a, b, 0x31);
}

void ace320( u32 *state )
{
        u8 i, j, k;
        u32 t1, t2;
        
        __m256i x[10*PARAL_INST_BY8];
        __m256i xtmp, ytmp;
        
        //Packing state
        for ( k = 0; k < PARAL_INST_BY8; k++)
        {
		xtmp = _mm256_loadu2_m128i((void *) (state + 4 + 80*k),(void *) (state + 0 + 80*k));
		ytmp = _mm256_loadu2_m128i((void *) (state + 14 + 80*k),(void *) (state + 10 + 80*k));
		xtmp = _mm256_permutevar8x32_epi32(xtmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		ytmp = _mm256_permutevar8x32_epi32(ytmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		x[10*k+0] = _mm256_permute2x128_si256(xtmp, ytmp, 0x20);
		x[10*k+4] = _mm256_permute2x128_si256(ytmp, ytmp, 0x31);
		
		xtmp = _mm256_loadu2_m128i((void *) (state + 24 + 80*k),(void *) (state + 20 + 80*k));
		ytmp = _mm256_loadu2_m128i((void *) (state + 34 + 80*k),(void *) (state + 30 + 80*k));
		xtmp = _mm256_permutevar8x32_epi32(xtmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		ytmp = _mm256_permutevar8x32_epi32(ytmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		x[10*k+1] = _mm256_permute2x128_si256(xtmp, ytmp, 0x20);
		x[10*k+5] = _mm256_permute2x128_si256(ytmp, ytmp, 0x31);
		
		xtmp = _mm256_loadu2_m128i((void *) (state + 44 + 80*k),(void *) (state + 40 + 80*k));
		ytmp = _mm256_loadu2_m128i((void *) (state + 54 + 80*k),(void *) (state + 50 + 80*k));
		xtmp = _mm256_permutevar8x32_epi32(xtmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		ytmp = _mm256_permutevar8x32_epi32(ytmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		x[10*k+2] = _mm256_permute2x128_si256(xtmp, ytmp, 0x20);
		x[10*k+6] = _mm256_permute2x128_si256(ytmp, ytmp, 0x31);
		
		xtmp = _mm256_loadu2_m128i((void *) (state + 64 + 80*k),(void *) (state + 60 + 80*k));
		ytmp = _mm256_loadu2_m128i((void *) (state + 74 + 80*k),(void *) (state + 70 + 80*k));
		xtmp = _mm256_permutevar8x32_epi32(xtmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		ytmp = _mm256_permutevar8x32_epi32(ytmp, _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0));
		x[10*k+3] = _mm256_permute2x128_si256(xtmp, ytmp, 0x20);
		x[10*k+7] = _mm256_permute2x128_si256(ytmp, ytmp, 0x31);

		x[10*k+8] = _mm256_set_epi32(state[80*k+39], state[80*k+38], state[80*k+29], state[80*k+28], state[80*k+19], state[80*k+18], state[80*k+9], state[80*k+8]);
		x[10*k+9] = _mm256_set_epi32(state[80*k+79], state[80*k+78], state[80*k+69], state[80*k+68], state[80*k+59], state[80*k+58], state[80*k+49], state[80*k+48]);

        }

        for ( i = 0; i < NUMSTEPS; i++ )
        {
                
                for ( k = 0; k < PARAL_INST_BY8; k++ )
                {
                        //SSb
                        PACK_SSb(x[10*k],x[10*k+1]);
                        for ( j = 0; j < SIMECKROUND; j++ )
                        {
                                t1 = (u32)((RC0[i] >> j)&1);
                                t2 = (u32)((RC1[i] >> j)&1);
                                ROAX(x[10*k], x[10*k+1], t1, t2);
                        }
                        UNPACK_SSb(x[10*k],x[10*k+1]);
                        
			PACK_SSb(x[10*k+2],x[10*k+3]);
                        for ( j = 0; j < SIMECKROUND; j++ )
                        {
                                t1 = (u32)((RC0[i] >> j)&1);
                                t2 = (u32)((RC1[i] >> j)&1);
                                ROAX(x[10*k+2], x[10*k+3], t1, t2);
                        }
                        UNPACK_SSb(x[10*k+2],x[10*k+3]);
			
			PACK_SSb(x[10*k+8],x[10*k+9]);
                        for ( j = 0; j < SIMECKROUND; j++ )
                        {
                                t1 = (u32)((RC2[i] >> j)&1);
                                t2 = (u32)((RC2[i] >> j)&1);
                                ROAX(x[10*k+8], x[10*k+9], t1, t2);
                        }
                        UNPACK_SSb(x[10*k+8],x[10*k+9]);

                        //ASc and MSb
                        t1 = (u32)SC0[i];
                        t2 = (u32)SC1[i];
                        x[10*k+4] = x[10*k+4]^SC(t1, t2);
                        x[10*k+5] = x[10*k+5]^SC(t1, t2);
                        x[10*k+6] = x[10*k+6]^SC(t1, t2);
                        x[10*k+7] = x[10*k+7]^SC(t1, t2);

			//R0, R4
			x[10*k] = _mm256_permute4x64_epi64(x[10*k], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+4] = _mm256_permute4x64_epi64(x[10*k+4], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpacklo_epi128(x[10*k], x[10*k+8]);
			x[10*k+4]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+4],x[10*k]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+8]^_mm256_permute4x64_epi64(x[10*k], _MM_SHUFFLE(1,0,3,2));
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpacklo_epi128(x[10*k], ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k] = xtmp;
			x[10*k+8] = x[10*k+8]&maskhi;
			x[10*k+8]^=(x[10*k+4]&masklo);
			x[10*k+4] = ytmp;

			//R1, R5
			x[10*k+1] = _mm256_permute4x64_epi64(x[10*k+1], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+5] = _mm256_permute4x64_epi64(x[10*k+5], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+1], _MM_SHUFFLE(1,0,3,2)), x[10*k+8]);
			x[10*k+5]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+5],x[10*k+1]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+8]^x[10*k+1];
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+1],_MM_SHUFFLE(1,0,3,2)), ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k+1] = xtmp;
			x[10*k+8] = x[10*k+8]&masklo;
			x[10*k+8]^=(_mm256_permute4x64_epi64(x[10*k+5], _MM_SHUFFLE(1,0,3,2))&maskhi);
			x[10*k+5] = ytmp;

			//R2, R6
			x[10*k+2] = _mm256_permute4x64_epi64(x[10*k+2], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+6] = _mm256_permute4x64_epi64(x[10*k+6], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpacklo_epi128(x[10*k+2], x[10*k+9]);
			x[10*k+6]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+6],x[10*k+2]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+9]^_mm256_permute4x64_epi64(x[10*k+2], _MM_SHUFFLE(1,0,3,2));
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpacklo_epi128(x[10*k+2], ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k+2] = xtmp;
			x[10*k+9] = x[10*k+9]&maskhi;
			x[10*k+9]^=(x[10*k+6]&masklo);
			x[10*k+6] = ytmp;

			//R3, R7
			x[10*k+3] = _mm256_permute4x64_epi64(x[10*k+3], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+7] = _mm256_permute4x64_epi64(x[10*k+7], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+3], _MM_SHUFFLE(1,0,3,2)), x[10*k+9]);
			x[10*k+7]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+7],x[10*k+3]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+9]^x[10*k+3];
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+3],_MM_SHUFFLE(1,0,3,2)), ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k+3] = xtmp;
			x[10*k+9] = x[10*k+9]&masklo;
			x[10*k+9]^=(_mm256_permute4x64_epi64(x[10*k+7], _MM_SHUFFLE(1,0,3,2))&maskhi);
			x[10*k+7] = ytmp;
                }
        }
        //Unpack state
        for ( k = 0; k < PARAL_INST_BY8; k++)
        {
		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+4],0), _mm256_extracti128_si256(x[10*k],0));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 0),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 4),_mm256_extracti128_si256(xtmp,1));
		state[8] = _mm256_extract_epi32 (x[10*k+8], 0);
		state[9] = _mm256_extract_epi32 (x[10*k+8], 1);

		 
		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+4],1), _mm256_extracti128_si256(x[10*k],1));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 10),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 14),_mm256_extracti128_si256(xtmp,1));
		_mm_storeu_si128((void *) (state + 18),_mm256_extracti128_si256(x[10*k+8],1));
		state[18] = _mm256_extract_epi32 (x[10*k+8], 2);
		state[19] = _mm256_extract_epi32 (x[10*k+8], 3);


		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+5],0), _mm256_extracti128_si256(x[10*k+1],0));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 20),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 24),_mm256_extracti128_si256(xtmp,1));
		state[28] = _mm256_extract_epi32 (x[10*k+8], 4);
		state[29] = _mm256_extract_epi32 (x[10*k+8], 5);
		 
		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+5],1), _mm256_extracti128_si256(x[10*k+1],1));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 30),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 34),_mm256_extracti128_si256(xtmp,1));
		state[38] = _mm256_extract_epi32 (x[10*k+8], 6);
		state[39] = _mm256_extract_epi32 (x[10*k+8], 7);

		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+6],0), _mm256_extracti128_si256(x[10*k+2],0));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 40),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 44),_mm256_extracti128_si256(xtmp,1));
		state[48] = _mm256_extract_epi32 (x[10*k+9], 0);
		state[49] = _mm256_extract_epi32 (x[10*k+9], 1);
		 
		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+6],1), _mm256_extracti128_si256(x[10*k+2],1));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 50),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 54),_mm256_extracti128_si256(xtmp,1));
		state[58] = _mm256_extract_epi32 (x[10*k+9], 2);
		state[59] = _mm256_extract_epi32 (x[10*k+9], 3);


		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+7],0), _mm256_extracti128_si256(x[10*k+3],0));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 60),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 64),_mm256_extracti128_si256(xtmp,1));
		state[68] = _mm256_extract_epi32 (x[10*k+9], 4);
		state[69] = _mm256_extract_epi32 (x[10*k+9], 5);
		 
		 xtmp = _mm256_set_m128i (_mm256_extracti128_si256(x[10*k+7],1), _mm256_extracti128_si256(x[10*k+3],1));
		 xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));
		 _mm_storeu_si128((void *) (state + 70),_mm256_extracti128_si256(xtmp,0));
		 _mm_storeu_si128((void *) (state + 74),_mm256_extracti128_si256(xtmp,1));
		state[78] = _mm256_extract_epi32 (x[10*k+9], 6);
		state[79] = _mm256_extract_epi32 (x[10*k+9], 7);
        }
        return;
}


void ace( __m256i *x )
{
        u8 i, j, k;
        u32 t1, t2;
        __m256i xtmp, ytmp;
        
        for ( i = 0; i < NUMSTEPS; i++ )
        {
                
                for ( k = 0; k < PARAL_INST_BY8; k++ )
                {
                        //SSb
                        PACK_SSb(x[10*k],x[10*k+1]);
                        for ( j = 0; j < SIMECKROUND; j++ )
                        {
                                t1 = (u32)((RC0[i] >> j)&1);
                                t2 = (u32)((RC1[i] >> j)&1);
                                ROAX(x[10*k], x[10*k+1], t1, t2);
                        }
                        UNPACK_SSb(x[10*k],x[10*k+1]);
                        
			PACK_SSb(x[10*k+2],x[10*k+3]);
                        for ( j = 0; j < SIMECKROUND; j++ )
                        {
                                t1 = (u32)((RC0[i] >> j)&1);
                                t2 = (u32)((RC1[i] >> j)&1);
                                ROAX(x[10*k+2], x[10*k+3], t1, t2);
                        }
                        UNPACK_SSb(x[10*k+2],x[10*k+3]);
			
			PACK_SSb(x[10*k+8],x[10*k+9]);
                        for ( j = 0; j < SIMECKROUND; j++ )
                        {
                                t1 = (u32)((RC2[i] >> j)&1);
                                t2 = (u32)((RC2[i] >> j)&1);
                                ROAX(x[10*k+8], x[10*k+9], t1, t2);
                        }
                        UNPACK_SSb(x[10*k+8],x[10*k+9]);

                        //ASc and MSb
                        t1 = (u32)SC0[i];
                        t2 = (u32)SC1[i];
                        x[10*k+4] = x[10*k+4]^SC(t1, t2);
                        x[10*k+5] = x[10*k+5]^SC(t1, t2);
                        x[10*k+6] = x[10*k+6]^SC(t1, t2);
                        x[10*k+7] = x[10*k+7]^SC(t1, t2);

			//R0, R4
			x[10*k] = _mm256_permute4x64_epi64(x[10*k], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+4] = _mm256_permute4x64_epi64(x[10*k+4], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpacklo_epi128(x[10*k], x[10*k+8]);
			x[10*k+4]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+4],x[10*k]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+8]^_mm256_permute4x64_epi64(x[10*k], _MM_SHUFFLE(1,0,3,2));
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpacklo_epi128(x[10*k], ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k] = xtmp;
			x[10*k+8] = x[10*k+8]&maskhi;
			x[10*k+8]^=(x[10*k+4]&masklo);
			x[10*k+4] = ytmp;

			//R1, R5
			x[10*k+1] = _mm256_permute4x64_epi64(x[10*k+1], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+5] = _mm256_permute4x64_epi64(x[10*k+5], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+1], _MM_SHUFFLE(1,0,3,2)), x[10*k+8]);
			x[10*k+5]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+5],x[10*k+1]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+8]^x[10*k+1];
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+1],_MM_SHUFFLE(1,0,3,2)), ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k+1] = xtmp;
			x[10*k+8] = x[10*k+8]&masklo;
			x[10*k+8]^=(_mm256_permute4x64_epi64(x[10*k+5], _MM_SHUFFLE(1,0,3,2))&maskhi);
			x[10*k+5] = ytmp;

			//R2, R6
			x[10*k+2] = _mm256_permute4x64_epi64(x[10*k+2], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+6] = _mm256_permute4x64_epi64(x[10*k+6], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpacklo_epi128(x[10*k+2], x[10*k+9]);
			x[10*k+6]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+6],x[10*k+2]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+9]^_mm256_permute4x64_epi64(x[10*k+2], _MM_SHUFFLE(1,0,3,2));
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpacklo_epi128(x[10*k+2], ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k+2] = xtmp;
			x[10*k+9] = x[10*k+9]&maskhi;
			x[10*k+9]^=(x[10*k+6]&masklo);
			x[10*k+6] = ytmp;

			//R3, R7
			x[10*k+3] = _mm256_permute4x64_epi64(x[10*k+3], _MM_SHUFFLE(2, 0, 3,1));
			x[10*k+7] = _mm256_permute4x64_epi64(x[10*k+7], _MM_SHUFFLE(3, 1, 2,0));
			xtmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+3], _MM_SHUFFLE(1,0,3,2)), x[10*k+9]);
			x[10*k+7]^=xtmp;
			xtmp = _mm256_unpackhi_epi128(x[10*k+7],x[10*k+3]);
			xtmp = _mm256_permute4x64_epi64(xtmp, _MM_SHUFFLE(3, 1, 2,0));

			ytmp = x[10*k+9]^x[10*k+3];
			ytmp^=SC((u32)SC2[i], (u32)SC2[i]);
			ytmp = _mm256_unpackhi_epi128(_mm256_permute4x64_epi64(x[10*k+3],_MM_SHUFFLE(1,0,3,2)), ytmp);
			ytmp = _mm256_permute4x64_epi64(ytmp, _MM_SHUFFLE(3, 1, 2,0));
			x[10*k+3] = xtmp;
			x[10*k+9] = x[10*k+9]&masklo;
			x[10*k+9]^=(_mm256_permute4x64_epi64(x[10*k+7], _MM_SHUFFLE(1,0,3,2))&maskhi);
			x[10*k+7] = ytmp;
                }
        }
        return;
}


int crypto_aead_encrypt( u32 *tag, u32 tlen, u32 *c, u32 *m, u32 mlen, u32 *ad, u32 adlen, u8 *k, u8 *npub, u32 klen )
{
        u8 i;
        u64 l, mblock, adblock;
        __m256i x[10*PARAL_INST_BY8];
	u32 index1, index2, index3, index4;

	mblock = mlen/2;
	adblock = adlen/2;
       
	//Initialization: Loading Key & Nonce
        for ( i = 0; i < PARAL_INST_BY8; i++ )
	{
		//Key and Nonce are loaded according to Packing.

		x[10*i+0] = _mm256_loadu2_m128i((void*) (k + 16 + 128*i), (void*) (k + 0 + 128*i)); 
		x[10*i+4] = _mm256_loadu2_m128i((void *) (npub + 16 + 128*i), (void *) (npub + 0 + 128*i));
		x[10*i+4] = x[10*i+4]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));


		
		x[10*i+1] = _mm256_loadu2_m128i((void *) (k + 48 + 128*i),(void *) (k + 32 + 128*i));
		x[10*i+5] = _mm256_loadu2_m128i((void *) (npub + 48 + 128*i), (void *) (npub + 32 + 128*i));
		x[10*i+5] = x[10*i+5]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));

		x[10*i+8] = _mm256_set_epi8(npub[128*i+63],npub[128*i+62],npub[128*i+61],npub[128*i+60],npub[128*i+59],npub[128*i+58],npub[128*i+57],npub[128*i+56], npub[128*i+47],npub[128*i+46],npub[128*i+45],npub[128*i+44],npub[128*i+43],npub[128*i+42],npub[128*i+41],npub[128*i+40], npub[128*i+31],npub[128*i+30],npub[128*i+29],npub[128*i+28],npub[128*i+27],npub[128*i+26],npub[128*i+25],npub[128*i+24],npub[128*i+15],npub[128*i+14],npub[128*i+13],npub[128*i+12],npub[128*i+11],npub[128*i+10],npub[128*i+9],npub[128*i+8]);

		x[10*i+2] = _mm256_loadu2_m128i((void *) (k + 80 + 128*i),(void *) (k + 64 + 128*i));
		x[10*i+6] = _mm256_loadu2_m128i((void *) (npub + 80 + 128*i), (void *) (npub + 64 + 128*i));
		x[10*i+6] = x[10*i+6]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));

		
		x[10*i+3] = _mm256_loadu2_m128i((void *) (k + 112 + 128*i),(void *) (k + 96 + 128*i));
		x[10*i+7] = _mm256_loadu2_m128i((void *) (npub + 112 + 128*i), (void *) (npub + 96 + 128*i));
		x[10*i+7] = x[10*i+7]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));

		x[10*i+9] = _mm256_set_epi8(npub[128*i+127],npub[128*i+126],npub[128*i+125],npub[128*i+124],npub[128*i+123],npub[128*i+122],npub[128*i+121],npub[128*i+120],npub[128*i+111],npub[128*i+110],npub[128*i+109],npub[128*i+108],npub[128*i+107],npub[128*i+106],npub[128*i+105],npub[128*i+104],npub[128*i+95],npub[128*i+94],npub[128*i+93],npub[128*i+92],npub[128*i+91],npub[128*i+90],npub[128*i+89],npub[128*i+88],npub[128*i+79],npub[128*i+78],npub[128*i+77],npub[128*i+76],npub[128*i+75],npub[128*i+74],npub[128*i+73],npub[128*i+72]);


	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+23],k[128*i+22],k[128*i+21],k[128*i+20],0x0,0x0,0x0,0x0,k[128*i+19],k[128*i+18],k[128*i+17],k[128*i+16],0x0,0x0,0x0,0x0,k[128*i+7],k[128*i+6],k[128*i+5],k[128*i+4],0x0,0x0,0x0,0x0,k[128*i+3],k[128*i+2],k[128*i+1],k[128*i+0]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+55],k[128*i+54],k[128*i+53],k[128*i+52],0x0,0x0,0x0,0x0,k[128*i+51],k[128*i+50],k[128*i+49],k[128*i+48],0x0,0x0,0x0,0x0,k[128*i+39],k[128*i+38],k[128*i+37],k[128*i+36],0x0,0x0,0x0,0x0,k[128*i+35],k[128*i+34],k[128*i+33],k[128*i+32]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+87],k[128*i+86],k[128*i+85],k[128*i+84],0x0,0x0,0x0,0x0,k[128*i+83],k[128*i+82],k[128*i+81],k[128*i+80],0x0,0x0,0x0,0x0,k[128*i+71],k[128*i+70],k[128*i+69],k[128*i+68],0x0,0x0,0x0,0x0,k[128*i+67],k[128*i+66],k[128*i+65],k[128*i+64]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+119],k[128*i+118],k[128*i+117],k[128*i+116],0x0,0x0,0x0,0x0,k[128*i+115],k[128*i+114],k[128*i+113],k[128*i+112],0x0,0x0,0x0,0x0,k[128*i+103],k[128*i+102],k[128*i+101],k[128*i+100],0x0,0x0,0x0,0x0,k[128*i+99],k[128*i+98],k[128*i+97],k[128*i+96]);

	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+31],k[128*i+30],k[128*i+29],k[128*i+28],0x0,0x0,0x0,0x0,k[128*i+27],k[128*i+26],k[128*i+25],k[128*i+24],0x0,0x0,0x0,0x0,k[128*i+15],k[128*i+14],k[128*i+13],k[128*i+12],0x0,0x0,0x0,0x0,k[128*i+11],k[128*i+10],k[128*i+9],k[128*i+8]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+63],k[128*i+62],k[128*i+61],k[128*i+60],0x0,0x0,0x0,0x0,k[128*i+59],k[128*i+58],k[128*i+57],k[128*i+56],0x0,0x0,0x0,0x0,k[128*i+47],k[128*i+46],k[128*i+45],k[128*i+44],0x0,0x0,0x0,0x0,k[128*i+43],k[128*i+42],k[128*i+41],k[128*i+40]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+95],k[128*i+94],k[128*i+93],k[128*i+92],0x0,0x0,0x0,0x0,k[128*i+91],k[128*i+90],k[128*i+89],k[128*i+88],0x0,0x0,0x0,0x0,k[128*i+79],k[128*i+78],k[128*i+77],k[128*i+76],0x0,0x0,0x0,0x0,k[128*i+75],k[128*i+74],k[128*i+73],k[128*i+72]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+127],k[128*i+126],k[128*i+125],k[128*i+124],0x0,0x0,0x0,0x0,k[128*i+123],k[128*i+122],k[128*i+121],k[128*i+120],0x0,0x0,0x0,0x0,k[128*i+111],k[128*i+110],k[128*i+109],k[128*i+108],0x0,0x0,0x0,0x0,k[128*i+107],k[128*i+106],k[128*i+105],k[128*i+104]);
	}
	ace(x);
	
	
	//Proceesing Associated Data
	for ( l = 0; l < adblock; l++ )
	{
		for ( i = 0; i < PARAL_INST_BY8; i++)
		{
			index1 = 2*l + adlen*8*i;
			index2 = 2*l + adlen*8*i+1;
			index3 = 2*l + adlen*8*i + adlen;
			index4 = 2*l + adlen*8*i + adlen + 1;
			x[10*i+0] = x[10*i+0]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);
			
			index1 = 2*l + adlen*8*i+2*adlen;
			index2 = 2*l + adlen*8*i+1+2*adlen;
			index3 = 2*l + adlen*8*i + 3*adlen;
			index4 = 2*l + adlen*8*i + 3*adlen + 1;
			x[10*i+1] = x[10*i+1]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);
			
			index1 = 2*l + adlen*8*i+4*adlen;
			index2 = 2*l + adlen*8*i+1+4*adlen;
			index3 = 2*l + adlen*8*i + 5*adlen;
			index4 = 2*l + adlen*8*i + 5*adlen + 1;
			x[10*i+2] = x[10*i+2]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);
			
			index1 = 2*l + adlen*8*i+6*adlen;
			index2 = 2*l + adlen*8*i+1+6*adlen;
			index3 = 2*l + adlen*8*i + 7*adlen;
			index4 = 2*l + adlen*8*i + 7*adlen + 1;
			x[10*i+3] = x[10*i+3]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);

			//Domain seperator
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 1)^0x00000080, 1);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 3)^0x00000080, 3);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 5)^0x00000080, 5);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 7)^0x00000080, 7);
		
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 1)^0x00000080, 1);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 3)^0x00000080, 3);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 5)^0x00000080, 5);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 7)^0x00000080, 7);
		}
		ace(x);
	}

	//Encrypt plaintext
	for ( l = 0; l < mblock; l++ )
	{
		for ( i = 0; i < PARAL_INST_BY8; i++)
		{
			index1 = 2*l + mlen*8*i;
			index2 = 2*l + mlen*8*i+1;
			index3 = 2*l + mlen*8*i + mlen;
			index4 = 2*l + mlen*8*i + mlen + 1;
			c[index1] = _mm256_extract_epi32 (x[10*i+0], 0)^m[index1];
			c[index2] = _mm256_extract_epi32 (x[10*i+0], 2)^m[index2];
			c[index3] = _mm256_extract_epi32 (x[10*i+0], 4)^m[index3];
			c[index4] = _mm256_extract_epi32 (x[10*i+0], 6)^m[index4];
			x[10*i+0] = x[10*i+0]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);
			
			index1 = 2*l + mlen*8*i+2*mlen;
			index2 = 2*l + mlen*8*i+1+2*mlen;
			index3 = 2*l + mlen*8*i + 3*mlen;
			index4 = 2*l + mlen*8*i + 3*mlen + 1;
			c[index1] = _mm256_extract_epi32 (x[10*i+1], 0)^m[index1];
			c[index2] = _mm256_extract_epi32 (x[10*i+1], 2)^m[index2];
			c[index3] = _mm256_extract_epi32 (x[10*i+1], 4)^m[index3];
			c[index4] = _mm256_extract_epi32 (x[10*i+1], 6)^m[index4];
			x[10*i+1] = x[10*i+1]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);
			
			index1 = 2*l + mlen*8*i+4*mlen;
			index2 = 2*l + mlen*8*i+1+4*mlen;
			index3 = 2*l + mlen*8*i + 5*mlen;
			index4 = 2*l + mlen*8*i + 5*mlen + 1;
			c[index1] = _mm256_extract_epi32 (x[10*i+2], 0)^m[index1];
			c[index2] = _mm256_extract_epi32 (x[10*i+2], 2)^m[index2];
			c[index3] = _mm256_extract_epi32 (x[10*i+2], 4)^m[index3];
			c[index4] = _mm256_extract_epi32 (x[10*i+2], 6)^m[index4];
			x[10*i+2] = x[10*i+2]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);
			
			index1 = 2*l + mlen*8*i+6*mlen;
			index2 = 2*l + mlen*8*i+1+6*mlen;
			index3 = 2*l + mlen*8*i + 7*mlen;
			index4 = 2*l + mlen*8*i + 7*mlen + 1;
			c[index1] = _mm256_extract_epi32 (x[10*i+3], 0)^m[index1];
			c[index2] = _mm256_extract_epi32 (x[10*i+3], 2)^m[index2];
			c[index3] = _mm256_extract_epi32 (x[10*i+3], 4)^m[index3];
			c[index4] = _mm256_extract_epi32 (x[10*i+3], 6)^m[index4];
			x[10*i+3] = x[10*i+3]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);

			//Domain seperator
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 1)^0x00000040, 1);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 3)^0x00000040, 3);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 5)^0x00000040, 5);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 7)^0x00000040, 7);
		
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 1)^0x00000040, 1);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 3)^0x00000040, 3);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 5)^0x00000040, 5);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 7)^0x00000040, 7);
		}
		ace(x);
	}
	
	// Tag Extraction Phase
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+23],k[128*i+22],k[128*i+21],k[128*i+20],0x0,0x0,0x0,0x0,k[128*i+19],k[128*i+18],k[128*i+17],k[128*i+16],0x0,0x0,0x0,0x0,k[128*i+7],k[128*i+6],k[128*i+5],k[128*i+4],0x0,0x0,0x0,0x0,k[128*i+3],k[128*i+2],k[128*i+1],k[128*i+0]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+55],k[128*i+54],k[128*i+53],k[128*i+52],0x0,0x0,0x0,0x0,k[128*i+51],k[128*i+50],k[128*i+49],k[128*i+48],0x0,0x0,0x0,0x0,k[128*i+39],k[128*i+38],k[128*i+37],k[128*i+36],0x0,0x0,0x0,0x0,k[128*i+35],k[128*i+34],k[128*i+33],k[128*i+32]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+87],k[128*i+86],k[128*i+85],k[128*i+84],0x0,0x0,0x0,0x0,k[128*i+83],k[128*i+82],k[128*i+81],k[128*i+80],0x0,0x0,0x0,0x0,k[128*i+71],k[128*i+70],k[128*i+69],k[128*i+68],0x0,0x0,0x0,0x0,k[128*i+67],k[128*i+66],k[128*i+65],k[128*i+64]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+119],k[128*i+118],k[128*i+117],k[128*i+116],0x0,0x0,0x0,0x0,k[128*i+115],k[128*i+114],k[128*i+113],k[128*i+112],0x0,0x0,0x0,0x0,k[128*i+103],k[128*i+102],k[128*i+101],k[128*i+100],0x0,0x0,0x0,0x0,k[128*i+99],k[128*i+98],k[128*i+97],k[128*i+96]);

	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+31],k[128*i+30],k[128*i+29],k[128*i+28],0x0,0x0,0x0,0x0,k[128*i+27],k[128*i+26],k[128*i+25],k[128*i+24],0x0,0x0,0x0,0x0,k[128*i+15],k[128*i+14],k[128*i+13],k[128*i+12],0x0,0x0,0x0,0x0,k[128*i+11],k[128*i+10],k[128*i+9],k[128*i+8]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+63],k[128*i+62],k[128*i+61],k[128*i+60],0x0,0x0,0x0,0x0,k[128*i+59],k[128*i+58],k[128*i+57],k[128*i+56],0x0,0x0,0x0,0x0,k[128*i+47],k[128*i+46],k[128*i+45],k[128*i+44],0x0,0x0,0x0,0x0,k[128*i+43],k[128*i+42],k[128*i+41],k[128*i+40]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+95],k[128*i+94],k[128*i+93],k[128*i+92],0x0,0x0,0x0,0x0,k[128*i+91],k[128*i+90],k[128*i+89],k[128*i+88],0x0,0x0,0x0,0x0,k[128*i+79],k[128*i+78],k[128*i+77],k[128*i+76],0x0,0x0,0x0,0x0,k[128*i+75],k[128*i+74],k[128*i+73],k[128*i+72]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+127],k[128*i+126],k[128*i+125],k[128*i+124],0x0,0x0,0x0,0x0,k[128*i+123],k[128*i+122],k[128*i+121],k[128*i+120],0x0,0x0,0x0,0x0,k[128*i+111],k[128*i+110],k[128*i+109],k[128*i+108],0x0,0x0,0x0,0x0,k[128*i+107],k[128*i+106],k[128*i+105],k[128*i+104]);

	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		 _mm_storeu_si128((void *)(tag+32*i),_mm256_extracti128_si256(x[10*i],0));
		 _mm_storeu_si128((void *)(tag+32*i+4),_mm256_extracti128_si256(x[10*i],1));
		 
		 _mm_storeu_si128((void *)(tag+32*i+8),_mm256_extracti128_si256(x[10*i+1],0));
		 _mm_storeu_si128((void *)(tag+32*i+12),_mm256_extracti128_si256(x[10*i+1],1));
		 
		 _mm_storeu_si128((void *)(tag+32*i+16),_mm256_extracti128_si256(x[10*i+2],0));
		 _mm_storeu_si128((void *)(tag+32*i+20),_mm256_extracti128_si256(x[10*i+2],1));
		 
		 _mm_storeu_si128((void *)(tag+32*i+24),_mm256_extracti128_si256(x[10*i+3],0));
		 _mm_storeu_si128((void *)(tag+32*i+28),_mm256_extracti128_si256(x[10*i+3],1));
	}
return 0;
}

int crypto_aead_decrypt( u32 *m, u32 *c, u32 mlen, u32 *tag, u32 tlen, u32 *ad, u32 adlen, u8 *k, u8 *npub, u32 klen )
{
        u8 i;
        u64 l, mblock, adblock;
        __m256i x[10*PARAL_INST_BY8];
        //__m256i xtmp, ytmp;
	u32 index1, index2, index3, index4;

	mblock = mlen/2;
	adblock = adlen/2;
       
	//Initialization: Loading Key & Nonce
        for ( i = 0; i < PARAL_INST_BY8; i++ )
	{
		//Key and Nonce are loaded according to Packing.

		x[10*i+0] = _mm256_loadu2_m128i((void *) (k + 16 + 128*i), (void *) (k + 0 + 128*i));
		
		x[10*i+4] = _mm256_loadu2_m128i((void *) (npub + 16 + 128*i), (void *) (npub + 0 + 128*i));
		x[10*i+4] = x[10*i+4]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));


		
		x[10*i+1] = _mm256_loadu2_m128i((void *) (k + 48 + 128*i),(void *) (k + 32 + 128*i));
		x[10*i+5] = _mm256_loadu2_m128i((void *) (npub + 48 + 128*i), (void *) (npub + 32 + 128*i));
		x[10*i+5] = x[10*i+5]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));

		x[10*i+8] = _mm256_set_epi8(npub[128*i+63],npub[128*i+62],npub[128*i+61],npub[128*i+60],npub[128*i+59],npub[128*i+58],npub[128*i+57],npub[128*i+56], npub[128*i+47],npub[128*i+46],npub[128*i+45],npub[128*i+44],npub[128*i+43],npub[128*i+42],npub[128*i+41],npub[128*i+40], npub[128*i+31],npub[128*i+30],npub[128*i+29],npub[128*i+28],npub[128*i+27],npub[128*i+26],npub[128*i+25],npub[128*i+24],npub[128*i+15],npub[128*i+14],npub[128*i+13],npub[128*i+12],npub[128*i+11],npub[128*i+10],npub[128*i+9],npub[128*i+8]);

		x[10*i+2] = _mm256_loadu2_m128i((void *) (k + 80 + 128*i),(void *) (k + 64 + 128*i));

		x[10*i+6] = _mm256_loadu2_m128i((void *) (npub + 80 + 128*i), (void *) (npub + 64 + 128*i));
		x[10*i+6] = x[10*i+6]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));

		
		x[10*i+3] = _mm256_loadu2_m128i((void *) (k + 112 + 128*i),(void *) (k + 96 + 128*i));
		x[10*i+7] = _mm256_loadu2_m128i((void *) (npub + 112 + 128*i), (void *) (npub + 96 + 128*i));
		x[10*i+7] = x[10*i+7]&(_mm256_set_epi64x(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff));

		x[10*i+9] = _mm256_set_epi8(npub[128*i+127],npub[128*i+126],npub[128*i+125],npub[128*i+124],npub[128*i+123],npub[128*i+122],npub[128*i+121],npub[128*i+120],npub[128*i+111],npub[128*i+110],npub[128*i+109],npub[128*i+108],npub[128*i+107],npub[128*i+106],npub[128*i+105],npub[128*i+104],npub[128*i+95],npub[128*i+94],npub[128*i+93],npub[128*i+92],npub[128*i+91],npub[128*i+90],npub[128*i+89],npub[128*i+88],npub[128*i+79],npub[128*i+78],npub[128*i+77],npub[128*i+76],npub[128*i+75],npub[128*i+74],npub[128*i+73],npub[128*i+72]);


	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+23],k[128*i+22],k[128*i+21],k[128*i+20],0x0,0x0,0x0,0x0,k[128*i+19],k[128*i+18],k[128*i+17],k[128*i+16],0x0,0x0,0x0,0x0,k[128*i+7],k[128*i+6],k[128*i+5],k[128*i+4],0x0,0x0,0x0,0x0,k[128*i+3],k[128*i+2],k[128*i+1],k[128*i+0]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+55],k[128*i+54],k[128*i+53],k[128*i+52],0x0,0x0,0x0,0x0,k[128*i+51],k[128*i+50],k[128*i+49],k[128*i+48],0x0,0x0,0x0,0x0,k[128*i+39],k[128*i+38],k[128*i+37],k[128*i+36],0x0,0x0,0x0,0x0,k[128*i+35],k[128*i+34],k[128*i+33],k[128*i+32]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+87],k[128*i+86],k[128*i+85],k[128*i+84],0x0,0x0,0x0,0x0,k[128*i+83],k[128*i+82],k[128*i+81],k[128*i+80],0x0,0x0,0x0,0x0,k[128*i+71],k[128*i+70],k[128*i+69],k[128*i+68],0x0,0x0,0x0,0x0,k[128*i+67],k[128*i+66],k[128*i+65],k[128*i+64]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+119],k[128*i+118],k[128*i+117],k[128*i+116],0x0,0x0,0x0,0x0,k[128*i+115],k[128*i+114],k[128*i+113],k[128*i+112],0x0,0x0,0x0,0x0,k[128*i+103],k[128*i+102],k[128*i+101],k[128*i+100],0x0,0x0,0x0,0x0,k[128*i+99],k[128*i+98],k[128*i+97],k[128*i+96]);

	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+31],k[128*i+30],k[128*i+29],k[128*i+28],0x0,0x0,0x0,0x0,k[128*i+27],k[128*i+26],k[128*i+25],k[128*i+24],0x0,0x0,0x0,0x0,k[128*i+15],k[128*i+14],k[128*i+13],k[128*i+12],0x0,0x0,0x0,0x0,k[128*i+11],k[128*i+10],k[128*i+9],k[128*i+8]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+63],k[128*i+62],k[128*i+61],k[128*i+60],0x0,0x0,0x0,0x0,k[128*i+59],k[128*i+58],k[128*i+57],k[128*i+56],0x0,0x0,0x0,0x0,k[128*i+47],k[128*i+46],k[128*i+45],k[128*i+44],0x0,0x0,0x0,0x0,k[128*i+43],k[128*i+42],k[128*i+41],k[128*i+40]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+95],k[128*i+94],k[128*i+93],k[128*i+92],0x0,0x0,0x0,0x0,k[128*i+91],k[128*i+90],k[128*i+89],k[128*i+88],0x0,0x0,0x0,0x0,k[128*i+79],k[128*i+78],k[128*i+77],k[128*i+76],0x0,0x0,0x0,0x0,k[128*i+75],k[128*i+74],k[128*i+73],k[128*i+72]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+127],k[128*i+126],k[128*i+125],k[128*i+124],0x0,0x0,0x0,0x0,k[128*i+123],k[128*i+122],k[128*i+121],k[128*i+120],0x0,0x0,0x0,0x0,k[128*i+111],k[128*i+110],k[128*i+109],k[128*i+108],0x0,0x0,0x0,0x0,k[128*i+107],k[128*i+106],k[128*i+105],k[128*i+104]);
	}
	ace(x);
	
	
	//Proceesing Associated Data
	for ( l = 0; l < adblock; l++ )
	{
		for ( i = 0; i < PARAL_INST_BY8; i++)
		{
			index1 = 2*l + adlen*8*i;
			index2 = 2*l + adlen*8*i+1;
			index3 = 2*l + adlen*8*i + adlen;
			index4 = 2*l + adlen*8*i + adlen + 1;
			x[10*i+0] = x[10*i+0]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);
			
			index1 = 2*l + adlen*8*i+2*adlen;
			index2 = 2*l + adlen*8*i+1+2*adlen;
			index3 = 2*l + adlen*8*i + 3*adlen;
			index4 = 2*l + adlen*8*i + 3*adlen + 1;
			x[10*i+1] = x[10*i+1]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);
			
			index1 = 2*l + adlen*8*i+4*adlen;
			index2 = 2*l + adlen*8*i+1+4*adlen;
			index3 = 2*l + adlen*8*i + 5*adlen;
			index4 = 2*l + adlen*8*i + 5*adlen + 1;
			x[10*i+2] = x[10*i+2]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);
			
			index1 = 2*l + adlen*8*i+6*adlen;
			index2 = 2*l + adlen*8*i+1+6*adlen;
			index3 = 2*l + adlen*8*i + 7*adlen;
			index4 = 2*l + adlen*8*i + 7*adlen + 1;
			x[10*i+3] = x[10*i+3]^_mm256_set_epi32(0x0, ad[index4], 0x0, ad[index3], 0x0, ad[index2], 0x0, ad[index1]);

			//Domain seperator
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 1)^0x00000080, 1);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 3)^0x00000080, 3);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 5)^0x00000080, 5);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 7)^0x00000080, 7);
		
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 1)^0x00000080, 1);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 3)^0x00000080, 3);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 5)^0x00000080, 5);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 7)^0x00000080, 7);
		}
		ace(x);
	}
        
	//Encrypt plaintext
	for ( l = 0; l < mblock; l++ )
	{
		for ( i = 0; i < PARAL_INST_BY8; i++)
		{
			index1 = 2*l + mlen*8*i;
			index2 = 2*l + mlen*8*i+1;
			index3 = 2*l + mlen*8*i + mlen;
			index4 = 2*l + mlen*8*i + mlen + 1;
			m[index1] = _mm256_extract_epi32 (x[10*i+0], 0)^c[index1];
			m[index2] = _mm256_extract_epi32 (x[10*i+0], 2)^c[index2];
			m[index3] = _mm256_extract_epi32 (x[10*i+0], 4)^c[index3];
			m[index4] = _mm256_extract_epi32 (x[10*i+0], 6)^c[index4];
			x[10*i+0] = x[10*i+0]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);
			
			index1 = 2*l + mlen*8*i+2*mlen;
			index2 = 2*l + mlen*8*i+1+2*mlen;
			index3 = 2*l + mlen*8*i + 3*mlen;
			index4 = 2*l + mlen*8*i + 3*mlen + 1;
			m[index1] = _mm256_extract_epi32 (x[10*i+1], 0)^c[index1];
			m[index2] = _mm256_extract_epi32 (x[10*i+1], 2)^c[index2];
			m[index3] = _mm256_extract_epi32 (x[10*i+1], 4)^c[index3];
			m[index4] = _mm256_extract_epi32 (x[10*i+1], 6)^c[index4];
			x[10*i+1] = x[10*i+1]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);
			
			index1 = 2*l + mlen*8*i+4*mlen;
			index2 = 2*l + mlen*8*i+1+4*mlen;
			index3 = 2*l + mlen*8*i + 5*mlen;
			index4 = 2*l + mlen*8*i + 5*mlen + 1;
			m[index1] = _mm256_extract_epi32 (x[10*i+2], 0)^c[index1];
			m[index2] = _mm256_extract_epi32 (x[10*i+2], 2)^c[index2];
			m[index3] = _mm256_extract_epi32 (x[10*i+2], 4)^c[index3];
			m[index4] = _mm256_extract_epi32 (x[10*i+2], 6)^c[index4];
			x[10*i+2] = x[10*i+2]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);
			
			index1 = 2*l + mlen*8*i+6*mlen;
			index2 = 2*l + mlen*8*i+1+6*mlen;
			index3 = 2*l + mlen*8*i + 7*mlen;
			index4 = 2*l + mlen*8*i + 7*mlen + 1;
			m[index1] = _mm256_extract_epi32 (x[10*i+3], 0)^c[index1];
			m[index2] = _mm256_extract_epi32 (x[10*i+3], 2)^c[index2];
			m[index3] = _mm256_extract_epi32 (x[10*i+3], 4)^c[index3];
			m[index4] = _mm256_extract_epi32 (x[10*i+3], 6)^c[index4];
			x[10*i+3] = x[10*i+3]^_mm256_set_epi32(0x0, m[index4], 0x0, m[index3], 0x0, m[index2], 0x0, m[index1]);

			//Domain seperator
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 1)^0x00000040, 1);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 3)^0x00000040, 3);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 5)^0x00000040, 5);
			x[10*i+8] = _mm256_insert_epi32(x[10*i+8], _mm256_extract_epi32 (x[10*i+8], 7)^0x00000040, 7);
		
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 1)^0x00000040, 1);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 3)^0x00000040, 3);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 5)^0x00000040, 5);
			x[10*i+9] = _mm256_insert_epi32(x[10*i+9], _mm256_extract_epi32 (x[10*i+9], 7)^0x00000040, 7);
		}
		ace(x);
	}

	// Tag Extraction Phase
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+23],k[128*i+22],k[128*i+21],k[128*i+20],0x0,0x0,0x0,0x0,k[128*i+19],k[128*i+18],k[128*i+17],k[128*i+16],0x0,0x0,0x0,0x0,k[128*i+7],k[128*i+6],k[128*i+5],k[128*i+4],0x0,0x0,0x0,0x0,k[128*i+3],k[128*i+2],k[128*i+1],k[128*i+0]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+55],k[128*i+54],k[128*i+53],k[128*i+52],0x0,0x0,0x0,0x0,k[128*i+51],k[128*i+50],k[128*i+49],k[128*i+48],0x0,0x0,0x0,0x0,k[128*i+39],k[128*i+38],k[128*i+37],k[128*i+36],0x0,0x0,0x0,0x0,k[128*i+35],k[128*i+34],k[128*i+33],k[128*i+32]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+87],k[128*i+86],k[128*i+85],k[128*i+84],0x0,0x0,0x0,0x0,k[128*i+83],k[128*i+82],k[128*i+81],k[128*i+80],0x0,0x0,0x0,0x0,k[128*i+71],k[128*i+70],k[128*i+69],k[128*i+68],0x0,0x0,0x0,0x0,k[128*i+67],k[128*i+66],k[128*i+65],k[128*i+64]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+119],k[128*i+118],k[128*i+117],k[128*i+116],0x0,0x0,0x0,0x0,k[128*i+115],k[128*i+114],k[128*i+113],k[128*i+112],0x0,0x0,0x0,0x0,k[128*i+103],k[128*i+102],k[128*i+101],k[128*i+100],0x0,0x0,0x0,0x0,k[128*i+99],k[128*i+98],k[128*i+97],k[128*i+96]);

	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		x[10*i+0] = x[10*i+0]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+31],k[128*i+30],k[128*i+29],k[128*i+28],0x0,0x0,0x0,0x0,k[128*i+27],k[128*i+26],k[128*i+25],k[128*i+24],0x0,0x0,0x0,0x0,k[128*i+15],k[128*i+14],k[128*i+13],k[128*i+12],0x0,0x0,0x0,0x0,k[128*i+11],k[128*i+10],k[128*i+9],k[128*i+8]);
		x[10*i+1] = x[10*i+1]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+63],k[128*i+62],k[128*i+61],k[128*i+60],0x0,0x0,0x0,0x0,k[128*i+59],k[128*i+58],k[128*i+57],k[128*i+56],0x0,0x0,0x0,0x0,k[128*i+47],k[128*i+46],k[128*i+45],k[128*i+44],0x0,0x0,0x0,0x0,k[128*i+43],k[128*i+42],k[128*i+41],k[128*i+40]);
		x[10*i+2] = x[10*i+2]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+95],k[128*i+94],k[128*i+93],k[128*i+92],0x0,0x0,0x0,0x0,k[128*i+91],k[128*i+90],k[128*i+89],k[128*i+88],0x0,0x0,0x0,0x0,k[128*i+79],k[128*i+78],k[128*i+77],k[128*i+76],0x0,0x0,0x0,0x0,k[128*i+75],k[128*i+74],k[128*i+73],k[128*i+72]);
		x[10*i+3] = x[10*i+3]^_mm256_set_epi8(0x0,0x0,0x0,0x0,k[128*i+127],k[128*i+126],k[128*i+125],k[128*i+124],0x0,0x0,0x0,0x0,k[128*i+123],k[128*i+122],k[128*i+121],k[128*i+120],0x0,0x0,0x0,0x0,k[128*i+111],k[128*i+110],k[128*i+109],k[128*i+108],0x0,0x0,0x0,0x0,k[128*i+107],k[128*i+106],k[128*i+105],k[128*i+104]);
	}
	ace(x);
	for ( i = 0; i < PARAL_INST_BY8; i++)
	{
		 _mm_storeu_si128((void *)(tag+32*i),_mm256_extracti128_si256(x[10*i],0));
		 _mm_storeu_si128((void *)(tag+32*i+4),_mm256_extracti128_si256(x[10*i],1));
		 
		 _mm_storeu_si128((void *)(tag+32*i+8),_mm256_extracti128_si256(x[10*i+1],0));
		 _mm_storeu_si128((void *)(tag+32*i+12),_mm256_extracti128_si256(x[10*i+1],1));
		 
		 _mm_storeu_si128((void *)(tag+32*i+16),_mm256_extracti128_si256(x[10*i+2],0));
		 _mm_storeu_si128((void *)(tag+32*i+20),_mm256_extracti128_si256(x[10*i+2],1));
		 
		 _mm_storeu_si128((void *)(tag+32*i+24),_mm256_extracti128_si256(x[10*i+3],0));
		 _mm_storeu_si128((void *)(tag+32*i+28),_mm256_extracti128_si256(x[10*i+3],1));
	}
return 0;
}