/******************************************************************************
* Fixsliced implementation of SKINNY-128-384.
* Two blocks are processed in parallel.
*
* This implementation doesn't compute the ShiftRows operation. Some masks and
* shifts are applied during the MixColumns operation so that the proper bits
* are XORed together. Moreover, the row permutation within the MixColumns 
* is omitted, as well as the bit permutation at the end of the Sbox. The rows
* are synchronized with the classical after only 4 rounds. However, the Sbox
* permutation requires 8 rounds for a synchronization. To limit the impact
* on code size, we compute the permutation every 4 rounds. Therefore, this
* implementation relies on a "QUADRUPLE_ROUND" routine.
*
* For more details, see the paper at: https://
*
* @author	Alexandre Adomnicai, Nanyang Technological University,
*			alexandre.adomnicai@ntu.edu.sg
*
* @date		May 2020
******************************************************************************/
#include <stdio.h>
#include <string.h>
#include "skinny128.h"
#include "tk_schedule.h"

/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 0.
****************************************************************************/
void mixcolumns_0(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = ROR(state[i],24) & 0x0c0c0c0c;
		state[i] ^= ROR(tmp,30);
		tmp = ROR(state[i],16) & 0xc0c0c0c0;
		state[i] ^= ROR(tmp,4);
		tmp = ROR(state[i],8) & 0x0c0c0c0c;
		state[i] ^= ROR(tmp,2);
	}
}

/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 1.
****************************************************************************/
void mixcolumns_1(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = ROR(state[i],16) & 0x30303030;
		state[i] ^= ROR(tmp,30);
		tmp = state[i] & 0x03030303;
		state[i] ^= ROR(tmp,28);
		tmp = ROR(state[i],16) & 0x30303030;
		state[i] ^= ROR(tmp,2);
	}
}

/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 2.
****************************************************************************/
void mixcolumns_2(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = ROR(state[i],8) & 0xc0c0c0c0;
		state[i] ^= ROR(tmp,6);
		tmp = ROR(state[i],16) & 0x0c0c0c0c;
		state[i] ^= ROR(tmp,28);
		tmp = ROR(state[i],24) & 0xc0c0c0c0;
		state[i] ^= ROR(tmp,2);
	}
}

/****************************************************************************
* The MixColumns operation for rounds i such that (i % 4) == 3.
****************************************************************************/
void mixcolumns_3(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = state[i] & 0x03030303;
		state[i] ^= ROR(tmp,30);
		tmp = state[i] & 0x30303030;
		state[i] ^= ROR(tmp,4);
		tmp = state[i] & 0x03030303;
		state[i] ^= ROR(tmp,26);
	}
}

/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 0
****************************************************************************/
void inv_mixcolumns_0(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = ROR(state[i],8) & 0x0c0c0c0c;
		state[i] ^= ROR(tmp,2);
		tmp = ROR(state[i],16) & 0xc0c0c0c0;
		state[i] ^= ROR(tmp,4);
		tmp = ROR(state[i],24) & 0x0c0c0c0c;
		state[i] ^= ROR(tmp,30);
	}
}

/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 1
****************************************************************************/
void inv_mixcolumns_1(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = ROR(state[i],16) & 0x30303030;
		state[i] ^= ROR(tmp,2);
		tmp = state[i] & 0x03030303;
		state[i] ^= ROR(tmp,28);
		tmp = ROR(state[i],16) & 0x30303030;
		state[i] ^= ROR(tmp,30);
	}
}

/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 2
****************************************************************************/
void inv_mixcolumns_2(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = ROR(state[i],24) & 0xc0c0c0c0;
		state[i] ^= ROR(tmp,2);
		tmp = ROR(state[i],16) & 0x0c0c0c0c;
		state[i] ^= ROR(tmp,28);
		tmp = ROR(state[i],8) & 0xc0c0c0c0;
		state[i] ^= ROR(tmp,6);
	}
}

/****************************************************************************
* The inverse MixColumns oepration for rounds i such that (i % 4) == 3
****************************************************************************/
void inv_mixcolumns_3(u32* state) {
	u32 tmp;
	for(int i = 0; i < 8; i++) {
		tmp = state[i] & 0x03030303;
		state[i] ^= ROR(tmp,26);
		tmp = state[i] & 0x30303030;
		state[i] ^= ROR(tmp,4);
		tmp = state[i] & 0x03030303;
		state[i] ^= ROR(tmp,30);
	}
}

/****************************************************************************
* Adds the tweakey (including the round constants) to the state.
****************************************************************************/
void add_tweakey(u32* state, const u32* rtk1, const u32* rtk2_3) {
	state[0] ^= rtk1[0] ^ rtk2_3[0];
	state[1] ^= rtk1[1] ^ rtk2_3[1]; 
	state[2] ^= rtk1[2] ^ rtk2_3[2];
	state[3] ^= rtk1[3] ^ rtk2_3[3];
	state[4] ^= rtk1[4] ^ rtk2_3[4];
	state[5] ^= rtk1[5] ^ rtk2_3[5];
	state[6] ^= rtk1[6] ^ rtk2_3[6];
	state[7] ^= rtk1[7] ^ rtk2_3[7];
}

/****************************************************************************
* Encryption of 2 blocks in parallel using SKINNY-128-384.
* The input parameters 'rtk1' and 'rtk2_3' are given seperately to avoid
* unnecessary recomputations of the entire tk schedule during SKINNY-AEAD-M1.
****************************************************************************/
void skinny128_384_plus_encrypt(u8* ctext, u8* ctext_bis, const u8* ptext, 
					const u8* ptext_bis, const tweakey tk) {
	u32 state[8];
	packing(state, ptext, ptext_bis);
	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
	QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
	QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
	QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
	QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
	unpacking(ctext, ctext_bis, state);
}

/****************************************************************************
* Decryption of 2 blocks in parallel using SKINNY-128-384.
* The input parameters 'rtk1' and 'rtk2_3' are given seperately to avoid
* unnecessary recomputations of the entire tk schedule during SKINNY-AEAD-M1.
****************************************************************************/
void skinny128_384_plus_decrypt(u8* ptext, u8* ptext_bis, const u8* ctext, 
					const u8* ctext_bis, const tweakey tk) {
	u32 state[8];
	packing(state, ctext, ctext_bis);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+288);
	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+256);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+224);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+192);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+160);
	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3+128);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+96, 	tk.rtk2_3+96);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+64, 	tk.rtk2_3+64);
	INV_QUADRUPLE_ROUND(state, tk.rtk1+32, 	tk.rtk2_3+32);
	INV_QUADRUPLE_ROUND(state, tk.rtk1, 	tk.rtk2_3);
	unpacking(ptext, ptext_bis, state);
}