/*
 * Copyright (C) 2021 Southern Storm Software, Pty Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "internal-photon256.h"
#include "internal-photon256-mix.h"
#include "internal-util.h"

/* Decide whether PHOTON-256 is accelerated with assembly code on this
 * target.  The flag is 1 for AVR and for ARMv7 with the Thumb instruction
 * set, and 0 everywhere else; the C implementation in the rest of this file
 * is only compiled when the flag is 0.
 *
 * NOTE(review): the flag is spelled PHOTON128_ASM even though this file
 * implements PHOTON-256.  The spelling is kept because the conditional
 * that guards the C code in this file tests this exact name. */
#if defined(__AVR__) || (defined(__ARM_ARCH_ISA_THUMB) && __ARM_ARCH == 7)
#define PHOTON128_ASM 1
#else
#define PHOTON128_ASM 0
#endif

#if !PHOTON128_ASM

/**
 * \brief Number of rounds in the PHOTON-256 permutation in bit-sliced form.
 *
 * This value bounds the round loop in photon256_permute() and sizes the
 * round constant table, which holds 8 words per round.
 */
#define PHOTON256_ROUNDS 12

/* Round constants for PHOTON-256, split out into separate bit-slices.
 *
 * Each round owns 8 consecutive words: the first four are XOR'ed into the
 * four slices of the top half of the state and the last four into the four
 * slices of the bottom half. */
static const uint32_t photon256_rc[PHOTON256_ROUNDS * 8] = {
    /* Round 1 */
    0x00000001U, 0x01010000U, 0x01000000U, 0x00000000U,     /* top */
    0x01010100U, 0x00000101U, 0x00010101U, 0x01010101U,     /* bottom */
    /* Round 2 */
    0x00000001U, 0x00000101U, 0x01000000U, 0x00000000U,     /* top */
    0x01010100U, 0x01010000U, 0x00010101U, 0x01010101U,     /* bottom */
    /* Round 3 */
    0x00000001U, 0x00000101U, 0x00010101U, 0x00000000U,     /* top */
    0x01010100U, 0x01010000U, 0x01000000U, 0x01010101U,     /* bottom */
    /* Round 4 */
    0x01010100U, 0x00000101U, 0x00010101U, 0x01010101U,     /* top */
    0x00000001U, 0x01010000U, 0x01000000U, 0x00000000U,     /* bottom */
    /* Round 5 */
    0x00000001U, 0x01010000U, 0x00010101U, 0x01010101U,     /* top */
    0x01010100U, 0x00000101U, 0x01000000U, 0x00000000U,     /* bottom */
    /* Round 6 */
    0x00000001U, 0x00000101U, 0x01000000U, 0x01010101U,     /* top */
    0x01010100U, 0x01010000U, 0x00010101U, 0x00000000U,     /* bottom */
    /* Round 7 */
    0x01010100U, 0x00000101U, 0x00010101U, 0x00000000U,     /* top */
    0x00000001U, 0x01010000U, 0x01000000U, 0x01010101U,     /* bottom */
    /* Round 8 */
    0x01010100U, 0x01010000U, 0x00010101U, 0x01010101U,     /* top */
    0x00000001U, 0x00000101U, 0x01000000U, 0x00000000U,     /* bottom */
    /* Round 9 */
    0x00000001U, 0x01010000U, 0x01000000U, 0x01010101U,     /* top */
    0x01010100U, 0x00000101U, 0x00010101U, 0x00000000U,     /* bottom */
    /* Round 10 */
    0x01010100U, 0x00000101U, 0x01000000U, 0x00000000U,     /* top */
    0x00000001U, 0x01010000U, 0x00010101U, 0x01010101U,     /* bottom */
    /* Round 11 */
    0x00000001U, 0x01010000U, 0x00010101U, 0x00000000U,     /* top */
    0x01010100U, 0x00000101U, 0x01000000U, 0x01010101U,     /* bottom */
    /* Round 12 */
    0x01010100U, 0x00000101U, 0x01000000U, 0x01010101U,     /* top */
    0x00000001U, 0x01010000U, 0x00010101U, 0x00000000U      /* bottom */
};

/**
 * \brief Evaluates the PHOTON-256 S-box in bit-sliced form.
 *
 * \param x0 Slice with bit 0 of all nibbles.
 * \param x1 Slice with bit 1 of all nibbles.
 * \param x2 Slice with bit 2 of all nibbles.
 * \param x3 Slice with bit 3 of all nibbles.
 *
 * All four slices are updated in place, so every argument must be a
 * modifiable lvalue.  Assumes temporary variables t1 and t2 are in the
 * calling scope; both are overwritten.
 *
 * This bit-sliced S-box implementation is based on the AVR version
 * "add_avr8_bitslice_asm" from the PHOTON-Beetle reference code.
 */
#define photon256_sbox(x0, x1, x2, x3) \
    do { \
        (x1) ^= (x2); \
        (x3) ^= ((x1) & (x2)); \
        t1 = (x3); \
        t2 = ((x1) & (x3)) ^ (x2); \
        (x3) = ~(t2 ^ (x0)); \
        t2 |= (x0); \
        (x0) ^= t1; \
        (x1) ^= (x0); \
        (x2) = ((x3) | (x1)) ^ t1; \
        (x1) ^= t2; \
        (x3) ^= (x1); \
    } while (0)

/* One step of a bit permutation network, as described at
 * http://programming.sirrida.de/perm_fn.html#bit_permute_step
 *
 * Every bit of _y selected by "mask" is exchanged with the bit that sits
 * "shift" positions above it; all other bits are left alone.  _y must be
 * a modifiable lvalue and is updated in place.  The locals carry a "bps_"
 * prefix so that they do not shadow a caller's variable passed as _y. */
#define bit_permute_step(_y, mask, shift) \
    do { \
        const uint32_t bps_src = (_y); \
        const uint32_t bps_diff = \
            (bps_src ^ (bps_src >> (shift))) & (mask); \
        (_y) = bps_src ^ bps_diff ^ (bps_diff << (shift)); \
    } while (0)

/* To convert to bit-sliced form, we first scatter bits 0..3 of the nibbles
 * to bytes 0..3 of the words.  Then we rearrange the bytes to group all
 * bits N into word N.
 *
 * TO_BITSLICED_PERM performs the first of those two stages on a single
 * 32-bit word, in place: input bit i is moved to output bit P[i], so that
 * byte N of the result holds bit N of each of the word's eight nibbles.
 * The byte regrouping across words is done by the caller.
 *
 * Permutation generated with "http://programming.sirrida.de/calcperm.php".
 *
 * P = [0 8 16 24 1 9 17 25 2 10 18 26 3 11 19 27
 *      4 12 20 28 5 13 21 29 6 14 22 30 7 15 23 31]
 *
 * The argument must be a modifiable uint32_t lvalue; the four steps below
 * must be applied in exactly this order.
 */
#define TO_BITSLICED_PERM(x) \
    do { \
        bit_permute_step(x, 0x0a0a0a0a, 3); \
        bit_permute_step(x, 0x00cc00cc, 6); \
        bit_permute_step(x, 0x0000f0f0, 12); \
        bit_permute_step(x, 0x0000ff00, 8); \
    } while (0)
/* Counterpart of TO_BITSLICED_PERM, used when converting back from
 * bit-sliced form: takes a word whose byte N holds bit N of eight nibbles
 * and gathers those bits back into eight consecutive nibbles, in place.
 * The argument must be a modifiable uint32_t lvalue and the steps must be
 * applied in exactly this order. */
#define FROM_BITSLICED_PERM(x) \
    do { \
        bit_permute_step(x, 0x00aa00aa, 7); \
        bit_permute_step(x, 0x0000cccc, 14); \
        bit_permute_step(x, 0x00f000f0, 4); \
        bit_permute_step(x, 0x0000ff00, 8); \
    } while (0)

/**
 * \brief Converts half of the PHOTON-256 state into bit-sliced form.
 *
 * \param s0 First word of the state half on output.
 * \param s1 Second word of the state half on output.
 * \param s2 Third word of the state half on output.
 * \param s3 Fourth word of the state half on output.
 * \param in Points to the 16 input bytes to convert.
 *
 * The input is read as four little-endian 32-bit words.  Each word is
 * permuted so that its byte N holds bit N of its nibbles, and the bytes
 * are then transposed: output word N collects byte N of every input word,
 * with the byte from the K-th input word placed in byte K.
 *
 * Assumes temporary variables t0, t1, t2, and t3 are in the calling scope;
 * all four are overwritten.
 */
#define photon256_to_sliced_half(s0, s1, s2, s3, in) \
    do { \
        t0 = le_load_word32((in)); \
        TO_BITSLICED_PERM(t0); \
        t1 = le_load_word32((in) + 4); \
        TO_BITSLICED_PERM(t1); \
        t2 = le_load_word32((in) + 8); \
        TO_BITSLICED_PERM(t2); \
        t3 = le_load_word32((in) + 12); \
        TO_BITSLICED_PERM(t3); \
        (s0) = (t0 & 0x000000FFU) | \
               ((t1 & 0x000000FFU) << 8) | \
               ((t2 & 0x000000FFU) << 16) | \
               ((t3 & 0x000000FFU) << 24); \
        (s1) = ((t0 & 0x0000FF00U) >> 8) | \
               (t1 & 0x0000FF00U) | \
               ((t2 & 0x0000FF00U) << 8) | \
               ((t3 & 0x0000FF00U) << 16); \
        (s2) = ((t0 & 0x00FF0000U) >> 16) | \
               ((t1 & 0x00FF0000U) >> 8) | \
               (t2 & 0x00FF0000U) | \
               ((t3 & 0x00FF0000U) << 8); \
        (s3) = ((t0 & 0xFF000000U) >> 24) | \
               ((t1 & 0xFF000000U) >> 16) | \
               ((t2 & 0xFF000000U) >> 8) | \
               (t3 & 0xFF000000U); \
    } while (0)

/**
 * \brief Converts half of the PHOTON-256 state from bit-sliced form back
 * into regular form.
 *
 * \param out Points to the 16-byte output buffer.
 * \param s0 First word of the state half on input.
 * \param s1 Second word of the state half on input.
 * \param s2 Third word of the state half on input.
 * \param s3 Fourth word of the state half on input.
 *
 * The bytes of the four input words are transposed (temporary K collects
 * byte K of every input word), each temporary is run through
 * FROM_BITSLICED_PERM, and the results are stored to the output buffer
 * as four little-endian 32-bit words.
 *
 * Assumes temporary variables t0, t1, t2, and t3 are in the calling scope;
 * all four are overwritten.
 */
#define photon256_from_sliced_half(out, s0, s1, s2, s3) \
    do { \
        t0 = ((s0) & 0x000000FFU) | \
             (((s1) << 8) & 0x0000FF00U) | \
             (((s2) << 16) & 0x00FF0000U) | \
             (((s3) << 24) & 0xFF000000U); \
        t1 = (((s0) >> 8) & 0x000000FFU) | \
             ((s1) & 0x0000FF00U) | \
             (((s2) << 8) & 0x00FF0000U) | \
             (((s3) << 16) & 0xFF000000U); \
        t2 = (((s0) >> 16) & 0x000000FFU) | \
             (((s1) >> 8) & 0x0000FF00U) | \
             ((s2) & 0x00FF0000U) | \
             (((s3) << 8) & 0xFF000000U); \
        t3 = (((s0) >> 24) & 0x000000FFU) | \
             (((s1) >> 16) & 0x0000FF00U) | \
             (((s2) >> 8) & 0x00FF0000U) | \
             ((s3) & 0xFF000000U); \
        FROM_BITSLICED_PERM(t0); \
        le_store_word32((out), t0); \
        FROM_BITSLICED_PERM(t1); \
        le_store_word32((out) + 4, t1); \
        FROM_BITSLICED_PERM(t2); \
        le_store_word32((out) + 8, t2); \
        FROM_BITSLICED_PERM(t3); \
        le_store_word32((out) + 12, t3); \
    } while (0)

/*
 * Row accessors used while mixing the columns.
 *
 * The bit-sliced state is kept as eight 32-bit words: words 0..3 hold the
 * four slices of the top half and words 4..7 the four slices of the bottom
 * half.  Byte K (counting from the least significant byte) of each word in
 * a half belongs to row K of that half.
 *
 * READ_ROWn() gathers the four slice bytes that belong to row n from the
 * local variable "S" and returns them as one 32-bit value with slice 0 in
 * the low byte.  WRITE_ROW(row, value) is the inverse: it scatters the four
 * bytes of "value" to the positions of "row" in "*state".  Both "S" and
 * "state" must be in the calling scope, and "row" must be a value in the
 * range 0..7.
 *
 * The byte offsets differ between the two branches because the bytes of a
 * 32-bit word are stored in the opposite order in memory on a big-endian
 * host.  In both branches, WRITE_ROW(n, v) must touch exactly the byte
 * positions that READ_ROWn() reads.
 */
#if defined(LW_UTIL_LITTLE_ENDIAN)
/* Index the bit-sliced state bytes in little-endian byte order */
#define READ_ROW0() \
     (((uint32_t)(S.B[0])) | \
     (((uint32_t)(S.B[4]))  << 8)  | \
     (((uint32_t)(S.B[8]))  << 16) | \
     (((uint32_t)(S.B[12])) << 24))
#define READ_ROW1() \
     (((uint32_t)(S.B[1])) | \
     (((uint32_t)(S.B[5]))  << 8)  | \
     (((uint32_t)(S.B[9]))  << 16) | \
     (((uint32_t)(S.B[13])) << 24))
#define READ_ROW2() \
     (((uint32_t)(S.B[2])) | \
     (((uint32_t)(S.B[6]))  << 8)  | \
     (((uint32_t)(S.B[10])) << 16) | \
     (((uint32_t)(S.B[14])) << 24))
#define READ_ROW3() \
     (((uint32_t)(S.B[3])) | \
     (((uint32_t)(S.B[7]))  << 8)  | \
     (((uint32_t)(S.B[11])) << 16) | \
     (((uint32_t)(S.B[15])) << 24))
#define READ_ROW4() \
     (((uint32_t)(S.B[16])) | \
     (((uint32_t)(S.B[20])) << 8)  | \
     (((uint32_t)(S.B[24])) << 16) | \
     (((uint32_t)(S.B[28])) << 24))
#define READ_ROW5() \
     (((uint32_t)(S.B[17])) | \
     (((uint32_t)(S.B[21])) << 8)  | \
     (((uint32_t)(S.B[25])) << 16) | \
     (((uint32_t)(S.B[29])) << 24))
#define READ_ROW6() \
     (((uint32_t)(S.B[18])) | \
     (((uint32_t)(S.B[22])) << 8)  | \
     (((uint32_t)(S.B[26])) << 16) | \
     (((uint32_t)(S.B[30])) << 24))
#define READ_ROW7() \
     (((uint32_t)(S.B[19])) | \
     (((uint32_t)(S.B[23])) << 8)  | \
     (((uint32_t)(S.B[27])) << 16) | \
     (((uint32_t)(S.B[31])) << 24))
#define WRITE_ROW(row, value) \
    do { \
        if ((row) < 4) { \
            state->B[(row)]      = (uint8_t)(value); \
            state->B[(row) + 4]  = (uint8_t)((value) >> 8); \
            state->B[(row) + 8]  = (uint8_t)((value) >> 16); \
            state->B[(row) + 12] = (uint8_t)((value) >> 24); \
        } else { \
            state->B[(row) + 12] = (uint8_t)(value); \
            state->B[(row) + 16] = (uint8_t)((value) >> 8); \
            state->B[(row) + 20] = (uint8_t)((value) >> 16); \
            state->B[(row) + 24] = (uint8_t)((value) >> 24); \
        } \
    } while (0)
#else
/* Index the bit-sliced state bytes in big-endian byte order */
#define READ_ROW0() \
     (((uint32_t)(S.B[3])) | \
     (((uint32_t)(S.B[7]))  << 8)  | \
     (((uint32_t)(S.B[11])) << 16) | \
     (((uint32_t)(S.B[15])) << 24))
#define READ_ROW1() \
     (((uint32_t)(S.B[2])) | \
     (((uint32_t)(S.B[6]))  << 8)  | \
     (((uint32_t)(S.B[10])) << 16) | \
     (((uint32_t)(S.B[14])) << 24))
#define READ_ROW2() \
     (((uint32_t)(S.B[1])) | \
     (((uint32_t)(S.B[5]))  << 8)  | \
     (((uint32_t)(S.B[9]))  << 16) | \
     (((uint32_t)(S.B[13])) << 24))
#define READ_ROW3() \
     (((uint32_t)(S.B[0])) | \
     (((uint32_t)(S.B[4]))  << 8)  | \
     (((uint32_t)(S.B[8]))  << 16) | \
     (((uint32_t)(S.B[12])) << 24))
#define READ_ROW4() \
     (((uint32_t)(S.B[19])) | \
     (((uint32_t)(S.B[23])) << 8)  | \
     (((uint32_t)(S.B[27])) << 16) | \
     (((uint32_t)(S.B[31])) << 24))
#define READ_ROW5() \
     (((uint32_t)(S.B[18])) | \
     (((uint32_t)(S.B[22])) << 8)  | \
     (((uint32_t)(S.B[26])) << 16) | \
     (((uint32_t)(S.B[30])) << 24))
#define READ_ROW6() \
     (((uint32_t)(S.B[17])) | \
     (((uint32_t)(S.B[21])) << 8)  | \
     (((uint32_t)(S.B[25])) << 16) | \
     (((uint32_t)(S.B[29])) << 24))
#define READ_ROW7() \
     (((uint32_t)(S.B[16])) | \
     (((uint32_t)(S.B[20])) << 8)  | \
     (((uint32_t)(S.B[24])) << 16) | \
     (((uint32_t)(S.B[28])) << 24))
/* For rows 4..7 the slice bytes live in words 4..7, i.e. at byte offsets
 * 16..31, and row 4 is the least significant byte of each word, which is
 * the last byte of the word in memory here.  Hence the bases 23/27/31/35:
 * row 4 maps to bytes 19/23/27/31 and row 7 to bytes 16/20/24/28, exactly
 * the bytes that READ_ROW4() .. READ_ROW7() read. */
#define WRITE_ROW(row, value) \
    do { \
        if ((row) < 4) { \
            state->B[3  - (row)] = (uint8_t)(value); \
            state->B[7  - (row)] = (uint8_t)((value) >> 8); \
            state->B[11 - (row)] = (uint8_t)((value) >> 16); \
            state->B[15 - (row)] = (uint8_t)((value) >> 24); \
        } else { \
            state->B[23 - (row)] = (uint8_t)(value); \
            state->B[27 - (row)] = (uint8_t)((value) >> 8); \
            state->B[31 - (row)] = (uint8_t)((value) >> 16); \
            state->B[35 - (row)] = (uint8_t)((value) >> 24); \
        } \
    } while (0)
#endif

/* Rotate all rows left by the row number.
 *
 * We do this by applying permutations to the top and bottom words
 * to rearrange the bits into the rotated form.  Permutations
 * generated with "http://programming.sirrida.de/calcperm.php".
 *
 * P_top = [0 1 2 3 4 5 6 7 15 8 9 10 11 12 13 14 22 23
 *          16 17 18 19 20 21 29 30 31 24 25 26 27 28]
 * P_bot = [4 5 6 7 0 1 2 3 11 12 13 14 15 8 9 10 18 19
 *          20 21 22 23 16 17 25 26 27 28 29 30 31 24]
 *
 * Input bit i moves to output bit P[i].  Each byte of a slice word holds
 * one row, so P_top rotates bytes 0..3 by 0..3 bit positions and P_bot
 * rotates bytes 0..3 by 4..7 bit positions.
 *
 * TOP_ROTATE_PERM applies P_top to x in place.  It works through the
 * temporary t1, which must be in the calling scope and is overwritten
 * (it holds the new value of x afterwards).
 */
#define TOP_ROTATE_PERM(x) \
    do { \
        t1 = (x); \
        bit_permute_step(t1, 0x07030100, 4); \
        bit_permute_step(t1, 0x22331100, 2); \
        bit_permute_step(t1, 0x55005500, 1); \
        (x) = t1; \
    } while (0)
/* Applies the bottom-half row rotation to x in place; it is used on the
 * slice words of rows 4..7, whose bytes 0..3 are rotated by 4..7 bit
 * positions.  Only the first mask differs from TOP_ROTATE_PERM.  Works
 * through the temporary t1, which must be in the calling scope and is
 * overwritten (it holds the new value of x afterwards). */
#define BOTTOM_ROTATE_PERM(x) \
    do { \
        t1 = (x); \
        bit_permute_step(t1, 0x080c0e0f, 4); \
        bit_permute_step(t1, 0x22331100, 2); \
        bit_permute_step(t1, 0x55005500, 1); \
        (x) = t1; \
    } while (0)

/**
 * \brief Applies the PHOTON-256 permutation to a state, in place.
 *
 * \param state The state to permute.  It is read in regular form through
 * state->B on entry and holds the permuted state in regular form on exit.
 *
 * While the rounds run, *state itself holds the bit-sliced working state
 * (accessed through state->W and, by WRITE_ROW, through state->B), so its
 * contents are not in regular form until the function returns.
 *
 * The names of the locals are significant: photon256_sbox() and the
 * rotate macros use t1/t2 implicitly, the slicing macros use t0..t3,
 * READ_ROWn() reads from S and WRITE_ROW() writes through state.
 */
void photon256_permute(photon256_state_t *state)
{
    /* Four slices of whichever half of the state is currently loaded */
    uint32_t s0, s1, s2, s3;
    /* Scratch for the helper macros, then the mixed output rows 0..3 */
    uint32_t t0, t1, t2, t3;
    /* Mixed output rows 4..7 */
    uint32_t t4, t5, t6, t7;
    /* Walks the round constant table, 8 words per round */
    const uint32_t *rc = photon256_rc;
    uint8_t round;

    /* Temporary state to convert from column order to row order */
    photon256_state_t S;

    /* Convert the state into bit-sliced form.  The bottom half of the
     * state is left in memory with the top half in local variables.
     * The bottom half is converted first so that s0..s3 end up holding
     * the top half, which is what the round loop expects on entry */
    photon256_to_sliced_half(s0, s1, s2, s3, state->B + 16);
    state->W[4] = s0;
    state->W[5] = s1;
    state->W[6] = s2;
    state->W[7] = s3;
    photon256_to_sliced_half(s0, s1, s2, s3, state->B);

    /* Perform all 12 permutation rounds.  To reduce the register pressure
     * on the CPU, we operate on half of the state at a time: top, bottom,
     * left, or right depending upon the step */
    for (round = 0; round < PHOTON256_ROUNDS; ++round) {
        /* Apply the round constants to the top half of the state */
        s0 ^= rc[0];
        s1 ^= rc[1];
        s2 ^= rc[2];
        s3 ^= rc[3];

        /* Apply the sbox to the top half of the state (clobbers t1, t2) */
        photon256_sbox(s0, s1, s2, s3);

        /* Rotate the rows of the top half by 0..3 bit positions and store
         * the result into the temporary state S (clobbers t1) */
        TOP_ROTATE_PERM(s0);
        TOP_ROTATE_PERM(s1);
        TOP_ROTATE_PERM(s2);
        TOP_ROTATE_PERM(s3);
        S.W[0] = s0;
        S.W[1] = s1;
        S.W[2] = s2;
        S.W[3] = s3;

        /* Load the bottom half of the state */
        s0 = state->W[4];
        s1 = state->W[5];
        s2 = state->W[6];
        s3 = state->W[7];

        /* Apply the round constants to the bottom half of the state,
         * then advance to the next round's constants */
        s0 ^= rc[4];
        s1 ^= rc[5];
        s2 ^= rc[6];
        s3 ^= rc[7];
        rc += 8;

        /* Apply the sbox to the bottom half of the state */
        photon256_sbox(s0, s1, s2, s3);

        /* Rotate the rows of the bottom half by 4..7 bit positions and store */
        BOTTOM_ROTATE_PERM(s0);
        BOTTOM_ROTATE_PERM(s1);
        BOTTOM_ROTATE_PERM(s2);
        BOTTOM_ROTATE_PERM(s3);
        S.W[4] = s0;
        S.W[5] = s1;
        S.W[6] = s2;
        S.W[7] = s3;

        /* Mixing the columns; process the left half of the state.
         * S is now fully populated, so rows can be read back from it.
         * The MIXL and MIXR macros are not defined in this file; they
         * presumably come from internal-photon256-mix.h, with MIXLn
         * starting output row n and MIXRn completing it - TODO confirm */
        s0 = READ_ROW0();
        s1 = READ_ROW1();
        s2 = READ_ROW2();
        s3 = READ_ROW3();
        MIXL0(t0, s0, s1, s2, s3);
        MIXL1(t1, s0, s1, s2, s3);
        MIXL2(t2, s0, s1, s2, s3);
        MIXL3(t3, s0, s1, s2, s3);
        MIXL4(t4, s0, s1, s2, s3);
        MIXL5(t5, s0, s1, s2, s3);
        MIXL6(t6, s0, s1, s2, s3);
        MIXL7(t7, s0, s1, s2, s3);

        /* Mixing the columns; process the right half of the state.
         * Each output row is written back into *state as soon as its
         * MIXR step has run; the input rows come from S, so overwriting
         * *state here does not disturb the remaining reads */
        s0 = READ_ROW4();
        s1 = READ_ROW5();
        s2 = READ_ROW6();
        s3 = READ_ROW7();
        MIXR4(t4, s0, s1, s2, s3);
        MIXR5(t5, s0, s1, s2, s3);
        MIXR6(t6, s0, s1, s2, s3);
        MIXR7(t7, s0, s1, s2, s3);
        WRITE_ROW(4, t4);
        WRITE_ROW(5, t5);
        WRITE_ROW(6, t6);
        WRITE_ROW(7, t7);
        MIXR0(t0, s0, s1, s2, s3);
        MIXR1(t1, s0, s1, s2, s3);
        MIXR2(t2, s0, s1, s2, s3);
        MIXR3(t3, s0, s1, s2, s3);
        WRITE_ROW(0, t0);
        WRITE_ROW(1, t1);
        WRITE_ROW(2, t2);
        WRITE_ROW(3, t3);

        /* Reload the top half of the state for the next round */
        s0 = state->W[0];
        s1 = state->W[1];
        s2 = state->W[2];
        s3 = state->W[3];
    }

    /* Convert back from bit-sliced form to regular form.  s0..s3 still
     * hold the top half from the end of the last round; the bottom half
     * is fetched from memory before its bytes are overwritten */
    photon256_from_sliced_half(state->B, s0, s1, s2, s3);
    s0 = state->W[4];
    s1 = state->W[5];
    s2 = state->W[6];
    s3 = state->W[7];
    photon256_from_sliced_half(state->B + 16, s0, s1, s2, s3);
}

#endif /* !PHOTON128_ASM */