hash.c 37.5 KB
Newer Older
lwc-tester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677
#include "api.h"

// Short aliases for unsigned integer types. The numeric suffix is the
// intended width in bits; note that the C standard only guarantees minimum
// widths for these base types, so the exact width is target-dependent.
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;

// 64 bits expressed in bytes, i.e. 8.
// NOTE(review): presumably the sponge rate (bytes processed per permutation
// call) -- confirm against the code that uses RATE.
#define RATE (64 / 8)
// NOTE(review): presumably the number of permutation rounds -- confirm
// against the code that uses PA_ROUNDS.
#define PA_ROUNDS 12

// Rotate the low 8 bits of x right by n bit positions (0 <= n <= 8).
//
// x is promoted to at least int before shifting, so without masking the
// left-shifted half would leave stray bits above bit 7 (and any bits of x
// above bit 7 would be shifted down into the result). Both halves are masked
// so the macro always yields a value in 0..255, whether or not the caller
// stores it back into an 8-bit variable.
//
// x is evaluated twice; it must be free of side effects.
#define ROTR8(x, n) \
  ((((x) & 0xFFu) >> (n)) | (((x) << (8 - (n))) & 0xFFu))

// Bit-transpose the eight bytes a[0..7] into eight output bytes ("slices"):
// bit k of var_j receives bit j of a[7 - k]. Put differently, var_j collects
// bit j of every input byte, with a[7] landing in bit 0 and a[0] in bit 7.
//
//   a             expression indexable with [0]..[7]. It is parenthesized on
//                 every use, so pointer expressions such as `in + 8` work.
//                 It is evaluated several times and must be free of side
//                 effects.
//   var_7..var_0  lvalues that are overwritten with the eight slices.
//
// The scratch byte lives inside the macro's own block, so the expansion does
// not depend on any temporaries being declared in the caller's scope. The
// body is straight-line code built from single-bit shifts only.
#define COMPRESS_BYTE_ARRAY_8(a, var_7, var_6, var_5, var_4, var_3, var_2, \
                              var_1, var_0)                                \
  do {                                                                     \
    unsigned char cb8_scratch;                                             \
    /* a[7] -> bit 0 of every slice (this row initializes the outputs) */  \
    cb8_scratch = (a)[7];                                                  \
    var_0 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_1 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_2 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_3 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_4 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_5 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_6 = cb8_scratch & 1;                                               \
    cb8_scratch >>= 1;                                                     \
    var_7 = cb8_scratch & 1;                                               \
    /* a[6] -> bit 1 of every slice */                                     \
    cb8_scratch = (a)[6];                                                  \
    var_1 |= cb8_scratch & 2;                                              \
    cb8_scratch >>= 1;                                                     \
    var_2 |= cb8_scratch & 2;                                              \
    cb8_scratch >>= 1;                                                     \
    var_3 |= cb8_scratch & 2;                                              \
    cb8_scratch >>= 1;                                                     \
    var_4 |= cb8_scratch & 2;                                              \
    cb8_scratch >>= 1;                                                     \
    var_5 |= cb8_scratch & 2;                                              \
    cb8_scratch >>= 1;                                                     \
    var_6 |= cb8_scratch & 2;                                              \
    cb8_scratch >>= 1;                                                     \
    var_7 |= cb8_scratch & 2;                                              \
    cb8_scratch = (a)[6] << 1;                                             \
    var_0 |= cb8_scratch & 2;                                              \
    /* a[5] -> bit 2 of every slice */                                     \
    cb8_scratch = (a)[5];                                                  \
    var_2 |= cb8_scratch & 4;                                              \
    cb8_scratch >>= 1;                                                     \
    var_3 |= cb8_scratch & 4;                                              \
    cb8_scratch >>= 1;                                                     \
    var_4 |= cb8_scratch & 4;                                              \
    cb8_scratch >>= 1;                                                     \
    var_5 |= cb8_scratch & 4;                                              \
    cb8_scratch >>= 1;                                                     \
    var_6 |= cb8_scratch & 4;                                              \
    cb8_scratch >>= 1;                                                     \
    var_7 |= cb8_scratch & 4;                                              \
    cb8_scratch = (a)[5] << 1;                                             \
    var_1 |= cb8_scratch & 4;                                              \
    cb8_scratch <<= 1;                                                     \
    var_0 |= cb8_scratch & 4;                                              \
    /* a[4] -> bit 3 of every slice */                                     \
    cb8_scratch = (a)[4];                                                  \
    var_3 |= cb8_scratch & 8;                                              \
    cb8_scratch >>= 1;                                                     \
    var_4 |= cb8_scratch & 8;                                              \
    cb8_scratch >>= 1;                                                     \
    var_5 |= cb8_scratch & 8;                                              \
    cb8_scratch >>= 1;                                                     \
    var_6 |= cb8_scratch & 8;                                              \
    cb8_scratch >>= 1;                                                     \
    var_7 |= cb8_scratch & 8;                                              \
    cb8_scratch = (a)[4] << 1;                                             \
    var_2 |= cb8_scratch & 8;                                              \
    cb8_scratch <<= 1;                                                     \
    var_1 |= cb8_scratch & 8;                                              \
    cb8_scratch <<= 1;                                                     \
    var_0 |= cb8_scratch & 8;                                              \
    /* a[3] -> bit 4 of every slice */                                     \
    cb8_scratch = (a)[3];                                                  \
    var_4 |= cb8_scratch & 16;                                             \
    cb8_scratch >>= 1;                                                     \
    var_5 |= cb8_scratch & 16;                                             \
    cb8_scratch >>= 1;                                                     \
    var_6 |= cb8_scratch & 16;                                             \
    cb8_scratch >>= 1;                                                     \
    var_7 |= cb8_scratch & 16;                                             \
    cb8_scratch = (a)[3] << 1;                                             \
    var_3 |= cb8_scratch & 16;                                             \
    cb8_scratch <<= 1;                                                     \
    var_2 |= cb8_scratch & 16;                                             \
    cb8_scratch <<= 1;                                                     \
    var_1 |= cb8_scratch & 16;                                             \
    cb8_scratch <<= 1;                                                     \
    var_0 |= cb8_scratch & 16;                                             \
    /* a[2] -> bit 5 of every slice */                                     \
    cb8_scratch = (a)[2];                                                  \
    var_5 |= cb8_scratch & 32;                                             \
    cb8_scratch >>= 1;                                                     \
    var_6 |= cb8_scratch & 32;                                             \
    cb8_scratch >>= 1;                                                     \
    var_7 |= cb8_scratch & 32;                                             \
    cb8_scratch = (a)[2] << 1;                                             \
    var_4 |= cb8_scratch & 32;                                             \
    cb8_scratch <<= 1;                                                     \
    var_3 |= cb8_scratch & 32;                                             \
    cb8_scratch <<= 1;                                                     \
    var_2 |= cb8_scratch & 32;                                             \
    cb8_scratch <<= 1;                                                     \
    var_1 |= cb8_scratch & 32;                                             \
    cb8_scratch <<= 1;                                                     \
    var_0 |= cb8_scratch & 32;                                             \
    /* a[1] -> bit 6 of every slice */                                     \
    cb8_scratch = (a)[1];                                                  \
    var_6 |= cb8_scratch & 64;                                             \
    cb8_scratch >>= 1;                                                     \
    var_7 |= cb8_scratch & 64;                                             \
    cb8_scratch = (a)[1] << 1;                                             \
    var_5 |= cb8_scratch & 64;                                             \
    cb8_scratch <<= 1;                                                     \
    var_4 |= cb8_scratch & 64;                                             \
    cb8_scratch <<= 1;                                                     \
    var_3 |= cb8_scratch & 64;                                             \
    cb8_scratch <<= 1;                                                     \
    var_2 |= cb8_scratch & 64;                                             \
    cb8_scratch <<= 1;                                                     \
    var_1 |= cb8_scratch & 64;                                             \
    cb8_scratch <<= 1;                                                     \
    var_0 |= cb8_scratch & 64;                                             \
    /* a[0] -> bit 7 of every slice */                                     \
    cb8_scratch = (a)[0];                                                  \
    var_7 |= cb8_scratch & 128;                                            \
    cb8_scratch = (a)[0] << 1;                                             \
    var_6 |= cb8_scratch & 128;                                            \
    cb8_scratch <<= 1;                                                     \
    var_5 |= cb8_scratch & 128;                                            \
    cb8_scratch <<= 1;                                                     \
    var_4 |= cb8_scratch & 128;                                            \
    cb8_scratch <<= 1;                                                     \
    var_3 |= cb8_scratch & 128;                                            \
    cb8_scratch <<= 1;                                                     \
    var_2 |= cb8_scratch & 128;                                            \
    cb8_scratch <<= 1;                                                     \
    var_1 |= cb8_scratch & 128;                                            \
    cb8_scratch <<= 1;                                                     \
    var_0 |= cb8_scratch & 128;                                            \
  } while (0)

// Bit-transpose eight byte "slices" back into the eight bytes a[0..7]:
// bit j of a[7 - k] receives bit k of var_j. Put differently, var_j supplies
// bit j of every output byte, its bit 0 going to a[7] and its bit 7 to a[0].
//
//   a             expression indexable with [0]..[7]; all eight bytes are
//                 overwritten. It is parenthesized on every use, so pointer
//                 expressions such as `out + 8` work. It is evaluated many
//                 times and must be free of side effects.
//   var_7..var_0  slice values (read only). They are parenthesized on use, so
//                 arbitrary expressions are accepted; each is evaluated more
//                 than once and must be free of side effects.
//
// The scratch byte lives inside the macro's own block, so the expansion does
// not depend on any temporaries being declared in the caller's scope. The
// body is straight-line code built from single-bit shifts only.
#define EXPAND_BYTE_ARRAY_8(a, var_7, var_6, var_5, var_4, var_3, var_2, \
                            var_1, var_0)                                \
  do {                                                                   \
    unsigned char eb8_scratch;                                           \
    /* var_0 -> bit 0 of every byte (this pass initializes a[0..7]) */   \
    (a)[7] = (var_0) & 1;                                                \
    eb8_scratch = (var_0) >> 1;                                          \
    (a)[6] = eb8_scratch & 1;                                            \
    eb8_scratch >>= 1;                                                   \
    (a)[5] = eb8_scratch & 1;                                            \
    eb8_scratch >>= 1;                                                   \
    (a)[4] = eb8_scratch & 1;                                            \
    eb8_scratch >>= 1;                                                   \
    (a)[3] = eb8_scratch & 1;                                            \
    eb8_scratch >>= 1;                                                   \
    (a)[2] = eb8_scratch & 1;                                            \
    eb8_scratch >>= 1;                                                   \
    (a)[1] = eb8_scratch & 1;                                            \
    eb8_scratch >>= 1;                                                   \
    (a)[0] = eb8_scratch & 1;                                            \
    /* var_1 -> bit 1 of every byte */                                   \
    (a)[6] |= (var_1) & 2;                                               \
    eb8_scratch = (var_1) << 1;                                          \
    (a)[7] |= eb8_scratch & 2;                                           \
    eb8_scratch = (var_1) >> 1;                                          \
    (a)[5] |= eb8_scratch & 2;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[4] |= eb8_scratch & 2;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[3] |= eb8_scratch & 2;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[2] |= eb8_scratch & 2;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[1] |= eb8_scratch & 2;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[0] |= eb8_scratch & 2;                                           \
    /* var_2 -> bit 2 of every byte */                                   \
    (a)[5] |= (var_2) & 4;                                               \
    eb8_scratch = (var_2) << 1;                                          \
    (a)[6] |= eb8_scratch & 4;                                           \
    eb8_scratch <<= 1;                                                   \
    (a)[7] |= eb8_scratch & 4;                                           \
    eb8_scratch = (var_2) >> 1;                                          \
    (a)[4] |= eb8_scratch & 4;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[3] |= eb8_scratch & 4;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[2] |= eb8_scratch & 4;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[1] |= eb8_scratch & 4;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[0] |= eb8_scratch & 4;                                           \
    /* var_3 -> bit 3 of every byte */                                   \
    (a)[4] |= (var_3) & 8;                                               \
    eb8_scratch = (var_3) << 1;                                          \
    (a)[5] |= eb8_scratch & 8;                                           \
    eb8_scratch <<= 1;                                                   \
    (a)[6] |= eb8_scratch & 8;                                           \
    eb8_scratch <<= 1;                                                   \
    (a)[7] |= eb8_scratch & 8;                                           \
    eb8_scratch = (var_3) >> 1;                                          \
    (a)[3] |= eb8_scratch & 8;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[2] |= eb8_scratch & 8;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[1] |= eb8_scratch & 8;                                           \
    eb8_scratch >>= 1;                                                   \
    (a)[0] |= eb8_scratch & 8;                                           \
    /* var_4 -> bit 4 of every byte */                                   \
    (a)[3] |= (var_4) & 16;                                              \
    eb8_scratch = (var_4) << 1;                                          \
    (a)[4] |= eb8_scratch & 16;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[5] |= eb8_scratch & 16;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[6] |= eb8_scratch & 16;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[7] |= eb8_scratch & 16;                                          \
    eb8_scratch = (var_4) >> 1;                                          \
    (a)[2] |= eb8_scratch & 16;                                          \
    eb8_scratch >>= 1;                                                   \
    (a)[1] |= eb8_scratch & 16;                                          \
    eb8_scratch >>= 1;                                                   \
    (a)[0] |= eb8_scratch & 16;                                          \
    /* var_5 -> bit 5 of every byte */                                   \
    (a)[2] |= (var_5) & 32;                                              \
    eb8_scratch = (var_5) << 1;                                          \
    (a)[3] |= eb8_scratch & 32;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[4] |= eb8_scratch & 32;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[5] |= eb8_scratch & 32;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[6] |= eb8_scratch & 32;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[7] |= eb8_scratch & 32;                                          \
    eb8_scratch = (var_5) >> 1;                                          \
    (a)[1] |= eb8_scratch & 32;                                          \
    eb8_scratch >>= 1;                                                   \
    (a)[0] |= eb8_scratch & 32;                                          \
    /* var_6 -> bit 6 of every byte */                                   \
    (a)[1] |= (var_6) & 64;                                              \
    eb8_scratch = (var_6) << 1;                                          \
    (a)[2] |= eb8_scratch & 64;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[3] |= eb8_scratch & 64;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[4] |= eb8_scratch & 64;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[5] |= eb8_scratch & 64;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[6] |= eb8_scratch & 64;                                          \
    eb8_scratch <<= 1;                                                   \
    (a)[7] |= eb8_scratch & 64;                                          \
    eb8_scratch = (var_6) >> 1;                                          \
    (a)[0] |= eb8_scratch & 64;                                          \
    /* var_7 -> bit 7 of every byte */                                   \
    (a)[0] |= (var_7) & 128;                                             \
    eb8_scratch = (var_7) << 1;                                          \
    (a)[1] |= eb8_scratch & 128;                                         \
    eb8_scratch <<= 1;                                                   \
    (a)[2] |= eb8_scratch & 128;                                         \
    eb8_scratch <<= 1;                                                   \
    (a)[3] |= eb8_scratch & 128;                                         \
    eb8_scratch <<= 1;                                                   \
    (a)[4] |= eb8_scratch & 128;                                         \
    eb8_scratch <<= 1;                                                   \
    (a)[5] |= eb8_scratch & 128;                                         \
    eb8_scratch <<= 1;                                                   \
    (a)[6] |= eb8_scratch & 128;                                         \
    eb8_scratch <<= 1;                                                   \
    (a)[7] |= eb8_scratch & 128;                                         \
  } while (0)

// This way of implementing Ascon's S-box was inspired by personal communication
// with Joan Daemen about implementing the 3-bit chi layer.
#define ROUND_16(C_7, C_6, C_5, C_4, C_3, C_2, C_1, C_0) \
  do {                                                   \
    /* round constant */                                 \
    x2_0 ^= C_0;                                         \
    x2_1 ^= C_1;                                         \
    x2_2 ^= C_2;                                         \
    x2_3 ^= C_3;                                         \
    x2_4 ^= C_4;                                         \
    x2_5 ^= C_5;                                         \
    x2_6 ^= C_6;                                         \
    x2_7 ^= C_7;                                         \
    /* s-box layer */                                    \
    x0_0 ^= x4_0;                                        \
    x4_0 ^= x3_0;                                        \
    x2_0 ^= x1_0;                                        \
    t0_0 = x0_0 & (~x4_0);                               \
    t1_0 = x2_0 & (~x1_0);                               \
    x0_0 ^= t1_0;                                        \
    t1_0 = x4_0 & (~x3_0);                               \
    x2_0 ^= t1_0;                                        \
    t1_0 = x1_0 & (~x0_0);                               \
    x4_0 ^= t1_0;                                        \
    t1_0 = x3_0 & (~x2_0);                               \
    x1_0 ^= t1_0;                                        \
    x3_0 ^= t0_0;                                        \
    x1_0 ^= x0_0;                                        \
    x3_0 ^= x2_0;                                        \
    x0_0 ^= x4_0;                                        \
    x2_0 = ~x2_0;                                        \
    x0_1 ^= x4_1;                                        \
    x4_1 ^= x3_1;                                        \
    x2_1 ^= x1_1;                                        \
    t0_0 = x0_1 & (~x4_1);                               \
    t1_0 = x2_1 & (~x1_1);                               \
    x0_1 ^= t1_0;                                        \
    t1_0 = x4_1 & (~x3_1);                               \
    x2_1 ^= t1_0;                                        \
    t1_0 = x1_1 & (~x0_1);                               \
    x4_1 ^= t1_0;                                        \
    t1_0 = x3_1 & (~x2_1);                               \
    x1_1 ^= t1_0;                                        \
    x3_1 ^= t0_0;                                        \
    x1_1 ^= x0_1;                                        \
    x3_1 ^= x2_1;                                        \
    x0_1 ^= x4_1;                                        \
    x2_1 = ~x2_1;                                        \
    x0_2 ^= x4_2;                                        \
    x4_2 ^= x3_2;                                        \
    x2_2 ^= x1_2;                                        \
    t0_0 = x0_2 & (~x4_2);                               \
    t1_0 = x2_2 & (~x1_2);                               \
    x0_2 ^= t1_0;                                        \
    t1_0 = x4_2 & (~x3_2);                               \
    x2_2 ^= t1_0;                                        \
    t1_0 = x1_2 & (~x0_2);                               \
    x4_2 ^= t1_0;                                        \
    t1_0 = x3_2 & (~x2_2);                               \
    x1_2 ^= t1_0;                                        \
    x3_2 ^= t0_0;                                        \
    x1_2 ^= x0_2;                                        \
    x3_2 ^= x2_2;                                        \
    x0_2 ^= x4_2;                                        \
    x2_2 = ~x2_2;                                        \
    x0_3 ^= x4_3;                                        \
    x4_3 ^= x3_3;                                        \
    x2_3 ^= x1_3;                                        \
    t0_0 = x0_3 & (~x4_3);                               \
    t1_0 = x2_3 & (~x1_3);                               \
    x0_3 ^= t1_0;                                        \
    t1_0 = x4_3 & (~x3_3);                               \
    x2_3 ^= t1_0;                                        \
    t1_0 = x1_3 & (~x0_3);                               \
    x4_3 ^= t1_0;                                        \
    t1_0 = x3_3 & (~x2_3);                               \
    x1_3 ^= t1_0;                                        \
    x3_3 ^= t0_0;                                        \
    x1_3 ^= x0_3;                                        \
    x3_3 ^= x2_3;                                        \
    x0_3 ^= x4_3;                                        \
    x2_3 = ~x2_3;                                        \
    x0_4 ^= x4_4;                                        \
    x4_4 ^= x3_4;                                        \
    x2_4 ^= x1_4;                                        \
    t0_0 = x0_4 & (~x4_4);                               \
    t1_0 = x2_4 & (~x1_4);                               \
    x0_4 ^= t1_0;                                        \
    t1_0 = x4_4 & (~x3_4);                               \
    x2_4 ^= t1_0;                                        \
    t1_0 = x1_4 & (~x0_4);                               \
    x4_4 ^= t1_0;                                        \
    t1_0 = x3_4 & (~x2_4);                               \
    x1_4 ^= t1_0;                                        \
    x3_4 ^= t0_0;                                        \
    x1_4 ^= x0_4;                                        \
    x3_4 ^= x2_4;                                        \
    x0_4 ^= x4_4;                                        \
    x2_4 = ~x2_4;                                        \
    x0_5 ^= x4_5;                                        \
    x4_5 ^= x3_5;                                        \
    x2_5 ^= x1_5;                                        \
    t0_0 = x0_5 & (~x4_5);                               \
    t1_0 = x2_5 & (~x1_5);                               \
    x0_5 ^= t1_0;                                        \
    t1_0 = x4_5 & (~x3_5);                               \
    x2_5 ^= t1_0;                                        \
    t1_0 = x1_5 & (~x0_5);                               \
    x4_5 ^= t1_0;                                        \
    t1_0 = x3_5 & (~x2_5);                               \
    x1_5 ^= t1_0;                                        \
    x3_5 ^= t0_0;                                        \
    x1_5 ^= x0_5;                                        \
    x3_5 ^= x2_5;                                        \
    x0_5 ^= x4_5;                                        \
    x2_5 = ~x2_5;                                        \
    x0_6 ^= x4_6;                                        \
    x4_6 ^= x3_6;                                        \
    x2_6 ^= x1_6;                                        \
    t0_0 = x0_6 & (~x4_6);                               \
    t1_0 = x2_6 & (~x1_6);                               \
    x0_6 ^= t1_0;                                        \
    t1_0 = x4_6 & (~x3_6);                               \
    x2_6 ^= t1_0;                                        \
    t1_0 = x1_6 & (~x0_6);                               \
    x4_6 ^= t1_0;                                        \
    t1_0 = x3_6 & (~x2_6);                               \
    x1_6 ^= t1_0;                                        \
    x3_6 ^= t0_0;                                        \
    x1_6 ^= x0_6;                                        \
    x3_6 ^= x2_6;                                        \
    x0_6 ^= x4_6;                                        \
    x2_6 = ~x2_6;                                        \
    x0_7 ^= x4_7;                                        \
    x4_7 ^= x3_7;                                        \
    x2_7 ^= x1_7;                                        \
    t0_0 = x0_7 & (~x4_7);                               \
    t1_0 = x2_7 & (~x1_7);                               \
    x0_7 ^= t1_0;                                        \
    t1_0 = x4_7 & (~x3_7);                               \
    x2_7 ^= t1_0;                                        \
    t1_0 = x1_7 & (~x0_7);                               \
    x4_7 ^= t1_0;                                        \
    t1_0 = x3_7 & (~x2_7);                               \
    x1_7 ^= t1_0;                                        \
    x3_7 ^= t0_0;                                        \
    x1_7 ^= x0_7;                                        \
    x3_7 ^= x2_7;                                        \
    x0_7 ^= x4_7;                                        \
    x2_7 = ~x2_7;                                        \
    /* linear layer */                                   \
    t0_0 = x0_0;                                         \
    t0_1 = x0_1;                                         \
    t0_2 = x0_2;                                         \
    t0_3 = x0_3;                                         \
    t0_4 = x0_4;                                         \
    t0_5 = x0_5;                                         \
    t0_6 = x0_6;                                         \
    t0_7 = x0_7;                                         \
    x0_5 ^= ROTR8(t0_0, 3);                              \
    x0_6 ^= ROTR8(t0_1, 3);                              \
    x0_7 ^= ROTR8(t0_2, 3);                              \
    x0_0 ^= ROTR8(t0_3, 2);                              \
    x0_1 ^= ROTR8(t0_4, 2);                              \
    x0_2 ^= ROTR8(t0_5, 2);                              \
    x0_3 ^= ROTR8(t0_6, 2);                              \
    x0_4 ^= ROTR8(t0_7, 2);                              \
    x0_4 ^= ROTR8(t0_0, 4);                              \
    x0_5 ^= ROTR8(t0_1, 4);                              \
    x0_6 ^= ROTR8(t0_2, 4);                              \
    x0_7 ^= ROTR8(t0_3, 4);                              \
    x0_0 ^= ROTR8(t0_4, 3);                              \
    x0_1 ^= ROTR8(t0_5, 3);                              \
    x0_2 ^= ROTR8(t0_6, 3);                              \
    x0_3 ^= ROTR8(t0_7, 3);                              \
    t0_0 = x1_0;                                         \
    t0_1 = x1_1;                                         \
    t0_2 = x1_2;                                         \
    t0_3 = x1_3;                                         \
    t0_4 = x1_4;                                         \
    t0_5 = x1_5;                                         \
    t0_6 = x1_6;                                         \
    t0_7 = x1_7;                                         \
    x1_3 ^= t0_0;                                        \
    x1_4 ^= t0_1;                                        \
    x1_5 ^= t0_2;                                        \
    x1_6 ^= t0_3;                                        \
    x1_7 ^= t0_4;                                        \
    x1_0 ^= ROTR8(t0_5, 7);                              \
    x1_1 ^= ROTR8(t0_6, 7);                              \
    x1_2 ^= ROTR8(t0_7, 7);                              \
    x1_1 ^= ROTR8(t0_0, 5);                              \
    x1_2 ^= ROTR8(t0_1, 5);                              \
    x1_3 ^= ROTR8(t0_2, 5);                              \
    x1_4 ^= ROTR8(t0_3, 5);                              \
    x1_5 ^= ROTR8(t0_4, 5);                              \
    x1_6 ^= ROTR8(t0_5, 5);                              \
    x1_7 ^= ROTR8(t0_6, 5);                              \
    x1_0 ^= ROTR8(t0_7, 4);                              \
    t0_0 = x2_0;                                         \
    t0_1 = x2_1;                                         \
    t0_2 = x2_2;                                         \
    t0_3 = x2_3;                                         \
    t0_4 = x2_4;                                         \
    t0_5 = x2_5;                                         \
    t0_6 = x2_6;                                         \
    t0_7 = x2_7;                                         \
    x2_7 ^= ROTR8(t0_0, 1);                              \
    x2_0 ^= t0_1;                                        \
    x2_1 ^= t0_2;                                        \
    x2_2 ^= t0_3;                                        \
    x2_3 ^= t0_4;                                        \
    x2_4 ^= t0_5;                                        \
    x2_5 ^= t0_6;                                        \
    x2_6 ^= t0_7;                                        \
    x2_2 ^= ROTR8(t0_0, 1);                              \
    x2_3 ^= ROTR8(t0_1, 1);                              \
    x2_4 ^= ROTR8(t0_2, 1);                              \
    x2_5 ^= ROTR8(t0_3, 1);                              \
    x2_6 ^= ROTR8(t0_4, 1);                              \
    x2_7 ^= ROTR8(t0_5, 1);                              \
    x2_0 ^= t0_6;                                        \
    x2_1 ^= t0_7;                                        \
    t0_0 = x3_0;                                         \
    t0_1 = x3_1;                                         \
    t0_2 = x3_2;                                         \
    t0_3 = x3_3;                                         \
    t0_4 = x3_4;                                         \
    t0_5 = x3_5;                                         \
    t0_6 = x3_6;                                         \
    t0_7 = x3_7;                                         \
    x3_6 ^= ROTR8(t0_0, 2);                              \
    x3_7 ^= ROTR8(t0_1, 2);                              \
    x3_0 ^= ROTR8(t0_2, 1);                              \
    x3_1 ^= ROTR8(t0_3, 1);                              \
    x3_2 ^= ROTR8(t0_4, 1);                              \
    x3_3 ^= ROTR8(t0_5, 1);                              \
    x3_4 ^= ROTR8(t0_6, 1);                              \
    x3_5 ^= ROTR8(t0_7, 1);                              \
    x3_7 ^= ROTR8(t0_0, 3);                              \
    x3_0 ^= ROTR8(t0_1, 2);                              \
    x3_1 ^= ROTR8(t0_2, 2);                              \
    x3_2 ^= ROTR8(t0_3, 2);                              \
    x3_3 ^= ROTR8(t0_4, 2);                              \
    x3_4 ^= ROTR8(t0_5, 2);                              \
    x3_5 ^= ROTR8(t0_6, 2);                              \
    x3_6 ^= ROTR8(t0_7, 2);                              \
    t0_0 = x4_0;                                         \
    t0_1 = x4_1;                                         \
    t0_2 = x4_2;                                         \
    t0_3 = x4_3;                                         \
    t0_4 = x4_4;                                         \
    t0_5 = x4_5;                                         \
    t0_6 = x4_6;                                         \
    t0_7 = x4_7;                                         \
    x4_1 ^= ROTR8(t0_0, 1);                              \
    x4_2 ^= ROTR8(t0_1, 1);                              \
    x4_3 ^= ROTR8(t0_2, 1);                              \
    x4_4 ^= ROTR8(t0_3, 1);                              \
    x4_5 ^= ROTR8(t0_4, 1);                              \
    x4_6 ^= ROTR8(t0_5, 1);                              \
    x4_7 ^= ROTR8(t0_6, 1);                              \
    x4_0 ^= t0_7;                                        \
    x4_7 ^= ROTR8(t0_0, 6);                              \
    x4_0 ^= ROTR8(t0_1, 5);                              \
    x4_1 ^= ROTR8(t0_2, 5);                              \
    x4_2 ^= ROTR8(t0_3, 5);                              \
    x4_3 ^= ROTR8(t0_4, 5);                              \
    x4_4 ^= ROTR8(t0_5, 5);                              \
    x4_5 ^= ROTR8(t0_6, 5);                              \
    x4_6 ^= ROTR8(t0_7, 5);                              \
  } while (0)

/*
 * P12_8: expands to twelve consecutive ROUND_16 invocations wrapped in a
 * single do { ... } while (0) statement.
 *
 * ROUND_16 works directly on the x0_0 .. x4_7 state variables and the
 * t0_x / t1_x temporaries of the enclosing function, so those names must be
 * declared in the scope where P12_8 is used.
 *
 * Read left to right as one byte, the eight arguments of successive rounds
 * spell 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a,
 * 0x4b. NOTE(review): presumably these are the per-round constants passed
 * bit by bit, most significant bit first -- confirm against the ROUND_16
 * definition. The order of the rounds is significant.
 */
#define P12_8                         \
  do {                                \
    ROUND_16(1, 1, 1, 1, 0, 0, 0, 0); \
    ROUND_16(1, 1, 1, 0, 0, 0, 0, 1); \
    ROUND_16(1, 1, 0, 1, 0, 0, 1, 0); \
    ROUND_16(1, 1, 0, 0, 0, 0, 1, 1); \
    ROUND_16(1, 0, 1, 1, 0, 1, 0, 0); \
    ROUND_16(1, 0, 1, 0, 0, 1, 0, 1); \
    ROUND_16(1, 0, 0, 1, 0, 1, 1, 0); \
    ROUND_16(1, 0, 0, 0, 0, 1, 1, 1); \
    ROUND_16(0, 1, 1, 1, 1, 0, 0, 0); \
    ROUND_16(0, 1, 1, 0, 1, 0, 0, 1); \
    ROUND_16(0, 1, 0, 1, 1, 0, 1, 0); \
    ROUND_16(0, 1, 0, 0, 1, 0, 1, 1); \
  } while (0)

/*
 * Hash `inlen` bytes starting at `in` and write the digest to `out`.
 *
 * The input is absorbed RATE bytes at a time into row x0 of the state, with
 * the P12_8 permutation applied after every block.  The final (possibly
 * empty) partial block is padded with a single 0x80 byte followed by zeros.
 * The digest is then produced RATE bytes at a time from row x0, running
 * P12_8 between output blocks, until CRYPTO_BYTES bytes are covered.
 *
 * Always returns 0.
 */
int crypto_hash(unsigned char *out, const unsigned char *in,
                unsigned long long inlen) {
  /*
   * State rows x0 .. x4, each split across eight u8 variables (_0 .. _7).
   * The permutation macros refer to these variables and to the t0_x / t1_x
   * temporaries by name, so the identifiers below must stay as they are.
   */
  u8 x0_0, x0_1, x0_2, x0_3, x0_4, x0_5, x0_6, x0_7;
  u8 x1_0, x1_1, x1_2, x1_3, x1_4, x1_5, x1_6, x1_7;
  u8 x2_0, x2_1, x2_2, x2_3, x2_4, x2_5, x2_6, x2_7;
  u8 x3_0, x3_1, x3_2, x3_3, x3_4, x3_5, x3_6, x3_7;
  u8 x4_0, x4_1, x4_2, x4_3, x4_4, x4_5, x4_6, x4_7;

  /* Scratch variables used by the macros. */
  u8 t0_0, t0_1, t0_2, t0_3, t0_4, t0_5, t0_6, t0_7;
  u8 t1_0, t1_1, t1_2, t1_3, t1_4, t1_5, t1_6, t1_7;

  /* Current input block after COMPRESS_BYTE_ARRAY_8 has split it up. */
  u8 in_0, in_1, in_2, in_3, in_4, in_5, in_6, in_7;

  u8 buffer[8]; /* padded final block */
  u64 rlen;     /* bytes still to absorb, later bytes still to emit */
  u64 i;

  /* Fixed initial state. */
  x0_0 = 0x4d; x0_1 = 0xdc; x0_2 = 0x85; x0_3 = 0xb9;
  x0_4 = 0x6b; x0_5 = 0x97; x0_6 = 0x8e; x0_7 = 0xfa;

  x1_0 = 0x94; x1_1 = 0xcd; x1_2 = 0x0c; x1_3 = 0xa4;
  x1_4 = 0x72; x1_5 = 0x50; x1_6 = 0x08; x1_7 = 0xc8;

  x2_0 = 0x14; x2_1 = 0x73; x2_2 = 0x84; x2_3 = 0x5a;
  x2_4 = 0xbe; x2_5 = 0x81; x2_6 = 0x17; x2_7 = 0xfe;

  x3_0 = 0xb2; x3_1 = 0x82; x3_2 = 0x00; x3_3 = 0x6d;
  x3_4 = 0x6c; x3_5 = 0x1f; x3_6 = 0x87; x3_7 = 0x2f;

  x4_0 = 0x7e; x4_1 = 0x40; x4_2 = 0xec; x4_3 = 0x50;
  x4_4 = 0x88; x4_5 = 0xa6; x4_6 = 0x1b; x4_7 = 0x7a;

  /* Absorb every complete RATE-byte block of the message. */
  for (rlen = inlen; rlen >= RATE; rlen -= RATE, in += RATE) {
    COMPRESS_BYTE_ARRAY_8(in, in_7, in_6, in_5, in_4, in_3, in_2, in_1, in_0);
    x0_0 ^= in_0;
    x0_1 ^= in_1;
    x0_2 ^= in_2;
    x0_3 ^= in_3;
    x0_4 ^= in_4;
    x0_5 ^= in_5;
    x0_6 ^= in_6;
    x0_7 ^= in_7;
    P12_8;
  }

  /*
   * Build the last block: the rlen leftover bytes, then 0x80, then zeros.
   * buffer holds 8 bytes, so this relies on rlen (< RATE) being below 8.
   */
  i = 0;
  while (i < rlen) {
    buffer[i] = in[i];
    ++i;
  }
  buffer[i++] = 0x80;
  while (i < 8) {
    buffer[i++] = 0;
  }

  /* Absorb the padded block. */
  COMPRESS_BYTE_ARRAY_8(buffer, in_7, in_6, in_5, in_4, in_3, in_2, in_1, in_0);
  x0_0 ^= in_0;
  x0_1 ^= in_1;
  x0_2 ^= in_2;
  x0_3 ^= in_3;
  x0_4 ^= in_4;
  x0_5 ^= in_5;
  x0_6 ^= in_6;
  x0_7 ^= in_7;
  P12_8;

  /* Squeeze: emit row x0, permuting between (not after) output blocks. */
  for (rlen = CRYPTO_BYTES; rlen > RATE; rlen -= RATE, out += RATE) {
    EXPAND_BYTE_ARRAY_8(out, x0_7, x0_6, x0_5, x0_4, x0_3, x0_2, x0_1, x0_0);
    P12_8;
  }
  EXPAND_BYTE_ARRAY_8(out, x0_7, x0_6, x0_5, x0_4, x0_3, x0_2, x0_1, x0_0);

  return 0;
}