1a4397635SArd Biesheuvel // SPDX-License-Identifier: GPL-2.0-or-later
2a4397635SArd Biesheuvel /*
3a4397635SArd Biesheuvel  * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
4a4397635SArd Biesheuvel  */
5a4397635SArd Biesheuvel 
6a4397635SArd Biesheuvel #ifdef CONFIG_ARM64
7a4397635SArd Biesheuvel #include <asm/neon-intrinsics.h>
8a4397635SArd Biesheuvel 
9a4397635SArd Biesheuvel #define AES_ROUND	"aese %0.16b, %1.16b \n\t aesmc %0.16b, %0.16b"
10a4397635SArd Biesheuvel #else
11a4397635SArd Biesheuvel #include <arm_neon.h>
12a4397635SArd Biesheuvel 
13a4397635SArd Biesheuvel #define AES_ROUND	"aese.8 %q0, %q1 \n\t aesmc.8 %q0, %q0"
14a4397635SArd Biesheuvel #endif
15a4397635SArd Biesheuvel 
16a4397635SArd Biesheuvel #define AEGIS_BLOCK_SIZE	16
17a4397635SArd Biesheuvel 
18a4397635SArd Biesheuvel #include <stddef.h>
19*4e3901faSArnd Bergmann #include "aegis-neon.h"
20a4397635SArd Biesheuvel 
2119842963SArd Biesheuvel extern int aegis128_have_aes_insn;
2219842963SArd Biesheuvel 
23a4397635SArd Biesheuvel void *memcpy(void *dest, const void *src, size_t n);
24a4397635SArd Biesheuvel 
25a4397635SArd Biesheuvel struct aegis128_state {
26a4397635SArd Biesheuvel 	uint8x16_t v[5];
27a4397635SArd Biesheuvel };
28a4397635SArd Biesheuvel 
29389139b3SArd Biesheuvel extern const uint8_t crypto_aes_sbox[];
3019842963SArd Biesheuvel 
aegis128_load_state_neon(const void * state)31a4397635SArd Biesheuvel static struct aegis128_state aegis128_load_state_neon(const void *state)
32a4397635SArd Biesheuvel {
33a4397635SArd Biesheuvel 	return (struct aegis128_state){ {
34a4397635SArd Biesheuvel 		vld1q_u8(state),
35a4397635SArd Biesheuvel 		vld1q_u8(state + 16),
36a4397635SArd Biesheuvel 		vld1q_u8(state + 32),
37a4397635SArd Biesheuvel 		vld1q_u8(state + 48),
38a4397635SArd Biesheuvel 		vld1q_u8(state + 64)
39a4397635SArd Biesheuvel 	} };
40a4397635SArd Biesheuvel }
41a4397635SArd Biesheuvel 
/*
 * Write the five state words of @st to the 80 bytes starting at @state,
 * 16 bytes per word, in index order.
 */
static void aegis128_save_state_neon(struct aegis128_state st, void *state)
{
	uint8_t *words = state;
	int i;

	for (i = 0; i < 5; i++)
		vst1q_u8(words + 16 * i, st.v[i]);
}
50a4397635SArd Biesheuvel 
51a4397635SArd Biesheuvel static inline __attribute__((always_inline))
aegis_aes_round(uint8x16_t w)52a4397635SArd Biesheuvel uint8x16_t aegis_aes_round(uint8x16_t w)
53a4397635SArd Biesheuvel {
54a4397635SArd Biesheuvel 	uint8x16_t z = {};
55a4397635SArd Biesheuvel 
5619842963SArd Biesheuvel #ifdef CONFIG_ARM64
5719842963SArd Biesheuvel 	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
58389139b3SArd Biesheuvel 		static const uint8_t shift_rows[] = {
5919842963SArd Biesheuvel 			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
6019842963SArd Biesheuvel 			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
6119842963SArd Biesheuvel 		};
62389139b3SArd Biesheuvel 		static const uint8_t ror32by8[] = {
6319842963SArd Biesheuvel 			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
6419842963SArd Biesheuvel 			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
6519842963SArd Biesheuvel 		};
6619842963SArd Biesheuvel 		uint8x16_t v;
6719842963SArd Biesheuvel 
6819842963SArd Biesheuvel 		// shift rows
69389139b3SArd Biesheuvel 		w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
7019842963SArd Biesheuvel 
7119842963SArd Biesheuvel 		// sub bytes
72389139b3SArd Biesheuvel #ifndef CONFIG_CC_IS_GCC
73389139b3SArd Biesheuvel 		v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
74389139b3SArd Biesheuvel 		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
75389139b3SArd Biesheuvel 		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
76389139b3SArd Biesheuvel 		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
77389139b3SArd Biesheuvel #else
7819842963SArd Biesheuvel 		asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
7919842963SArd Biesheuvel 		w -= 0x40;
8019842963SArd Biesheuvel 		asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
8119842963SArd Biesheuvel 		w -= 0x40;
8219842963SArd Biesheuvel 		asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
8319842963SArd Biesheuvel 		w -= 0x40;
8419842963SArd Biesheuvel 		asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
85389139b3SArd Biesheuvel #endif
8619842963SArd Biesheuvel 
8719842963SArd Biesheuvel 		// mix columns
8819842963SArd Biesheuvel 		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
8919842963SArd Biesheuvel 		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
90389139b3SArd Biesheuvel 		w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
9119842963SArd Biesheuvel 
9219842963SArd Biesheuvel 		return w;
9319842963SArd Biesheuvel 	}
9419842963SArd Biesheuvel #endif
9519842963SArd Biesheuvel 
96a4397635SArd Biesheuvel 	/*
97a4397635SArd Biesheuvel 	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
98a4397635SArd Biesheuvel 	 * to force the compiler to issue the aese/aesmc instructions in pairs.
99a4397635SArd Biesheuvel 	 * This is much faster on many cores, where the instruction pair can
100a4397635SArd Biesheuvel 	 * execute in a single cycle.
101a4397635SArd Biesheuvel 	 */
102a4397635SArd Biesheuvel 	asm(AES_ROUND : "+w"(w) : "w"(z));
103a4397635SArd Biesheuvel 	return w;
104a4397635SArd Biesheuvel }
105a4397635SArd Biesheuvel 
106a4397635SArd Biesheuvel static inline __attribute__((always_inline))
aegis128_update_neon(struct aegis128_state st,uint8x16_t m)107a4397635SArd Biesheuvel struct aegis128_state aegis128_update_neon(struct aegis128_state st,
108a4397635SArd Biesheuvel 					   uint8x16_t m)
109a4397635SArd Biesheuvel {
110a4397635SArd Biesheuvel 	m       ^= aegis_aes_round(st.v[4]);
111a4397635SArd Biesheuvel 	st.v[4] ^= aegis_aes_round(st.v[3]);
112a4397635SArd Biesheuvel 	st.v[3] ^= aegis_aes_round(st.v[2]);
113a4397635SArd Biesheuvel 	st.v[2] ^= aegis_aes_round(st.v[1]);
114a4397635SArd Biesheuvel 	st.v[1] ^= aegis_aes_round(st.v[0]);
115a4397635SArd Biesheuvel 	st.v[0] ^= m;
116a4397635SArd Biesheuvel 
117a4397635SArd Biesheuvel 	return st;
118a4397635SArd Biesheuvel }
119a4397635SArd Biesheuvel 
12019842963SArd Biesheuvel static inline __attribute__((always_inline))
preload_sbox(void)12119842963SArd Biesheuvel void preload_sbox(void)
12219842963SArd Biesheuvel {
12319842963SArd Biesheuvel 	if (!IS_ENABLED(CONFIG_ARM64) ||
12419842963SArd Biesheuvel 	    !IS_ENABLED(CONFIG_CC_IS_GCC) ||
12519842963SArd Biesheuvel 	    __builtin_expect(aegis128_have_aes_insn, 1))
12619842963SArd Biesheuvel 		return;
12719842963SArd Biesheuvel 
12819842963SArd Biesheuvel 	asm("ld1	{v16.16b-v19.16b}, [%0], #64	\n\t"
12919842963SArd Biesheuvel 	    "ld1	{v20.16b-v23.16b}, [%0], #64	\n\t"
13019842963SArd Biesheuvel 	    "ld1	{v24.16b-v27.16b}, [%0], #64	\n\t"
13119842963SArd Biesheuvel 	    "ld1	{v28.16b-v31.16b}, [%0]		\n\t"
13219842963SArd Biesheuvel 	    :: "r"(crypto_aes_sbox));
13319842963SArd Biesheuvel }
13419842963SArd Biesheuvel 
crypto_aegis128_init_neon(void * state,const void * key,const void * iv)13552828263SArd Biesheuvel void crypto_aegis128_init_neon(void *state, const void *key, const void *iv)
13652828263SArd Biesheuvel {
13752828263SArd Biesheuvel 	static const uint8_t const0[] = {
13852828263SArd Biesheuvel 		0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d,
13952828263SArd Biesheuvel 		0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62,
14052828263SArd Biesheuvel 	};
14152828263SArd Biesheuvel 	static const uint8_t const1[] = {
14252828263SArd Biesheuvel 		0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1,
14352828263SArd Biesheuvel 		0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd,
14452828263SArd Biesheuvel 	};
14552828263SArd Biesheuvel 	uint8x16_t k = vld1q_u8(key);
14652828263SArd Biesheuvel 	uint8x16_t kiv = k ^ vld1q_u8(iv);
14752828263SArd Biesheuvel 	struct aegis128_state st = {{
14852828263SArd Biesheuvel 		kiv,
14952828263SArd Biesheuvel 		vld1q_u8(const1),
15052828263SArd Biesheuvel 		vld1q_u8(const0),
15152828263SArd Biesheuvel 		k ^ vld1q_u8(const0),
15252828263SArd Biesheuvel 		k ^ vld1q_u8(const1),
15352828263SArd Biesheuvel 	}};
15452828263SArd Biesheuvel 	int i;
15552828263SArd Biesheuvel 
15652828263SArd Biesheuvel 	preload_sbox();
15752828263SArd Biesheuvel 
15852828263SArd Biesheuvel 	for (i = 0; i < 5; i++) {
15952828263SArd Biesheuvel 		st = aegis128_update_neon(st, k);
16052828263SArd Biesheuvel 		st = aegis128_update_neon(st, kiv);
16152828263SArd Biesheuvel 	}
16252828263SArd Biesheuvel 	aegis128_save_state_neon(st, state);
16352828263SArd Biesheuvel }
16452828263SArd Biesheuvel 
crypto_aegis128_update_neon(void * state,const void * msg)165a4397635SArd Biesheuvel void crypto_aegis128_update_neon(void *state, const void *msg)
166a4397635SArd Biesheuvel {
167a4397635SArd Biesheuvel 	struct aegis128_state st = aegis128_load_state_neon(state);
168a4397635SArd Biesheuvel 
16919842963SArd Biesheuvel 	preload_sbox();
17019842963SArd Biesheuvel 
171a4397635SArd Biesheuvel 	st = aegis128_update_neon(st, vld1q_u8(msg));
172a4397635SArd Biesheuvel 
173a4397635SArd Biesheuvel 	aegis128_save_state_neon(st, state);
174a4397635SArd Biesheuvel }
175a4397635SArd Biesheuvel 
176ad00d41bSArd Biesheuvel #ifdef CONFIG_ARM
/*
 * AArch32 does not provide these intrinsics natively because it does not
 * implement the underlying instructions. AArch32 only provides 64-bit
 * wide vtbl.8/vtbx.8 instructions, so use those instead.
 */
vqtbl1q_u8(uint8x16_t a,uint8x16_t b)182ad00d41bSArd Biesheuvel static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
183ad00d41bSArd Biesheuvel {
184ad00d41bSArd Biesheuvel 	union {
185ad00d41bSArd Biesheuvel 		uint8x16_t	val;
186ad00d41bSArd Biesheuvel 		uint8x8x2_t	pair;
187ad00d41bSArd Biesheuvel 	} __a = { a };
188ad00d41bSArd Biesheuvel 
189ad00d41bSArd Biesheuvel 	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
190ad00d41bSArd Biesheuvel 			   vtbl2_u8(__a.pair, vget_high_u8(b)));
191ad00d41bSArd Biesheuvel }
192ad00d41bSArd Biesheuvel 
vqtbx1q_u8(uint8x16_t v,uint8x16_t a,uint8x16_t b)193ad00d41bSArd Biesheuvel static uint8x16_t vqtbx1q_u8(uint8x16_t v, uint8x16_t a, uint8x16_t b)
194ad00d41bSArd Biesheuvel {
195ad00d41bSArd Biesheuvel 	union {
196ad00d41bSArd Biesheuvel 		uint8x16_t	val;
197ad00d41bSArd Biesheuvel 		uint8x8x2_t	pair;
198ad00d41bSArd Biesheuvel 	} __a = { a };
199ad00d41bSArd Biesheuvel 
200ad00d41bSArd Biesheuvel 	return vcombine_u8(vtbx2_u8(vget_low_u8(v), __a.pair, vget_low_u8(b)),
201ad00d41bSArd Biesheuvel 			   vtbx2_u8(vget_high_u8(v), __a.pair, vget_high_u8(b)));
202ad00d41bSArd Biesheuvel }
20397b70180SArd Biesheuvel 
/*
 * AArch32 stand-in for the across-vector minimum: return the smallest of
 * the sixteen signed bytes in @v.
 */
static int8_t vminvq_s8(int8x16_t v)
{
	/* the first pairwise minimum reduces 16 candidates to 8 */
	int8x8_t acc = vpmin_s8(vget_low_s8(v), vget_high_s8(v));
	int fold;

	/* three more folds reduce 8 -> 4 -> 2 -> 1, leaving it in lane 0 */
	for (fold = 0; fold < 3; fold++)
		acc = vpmin_s8(acc, acc);

	return vget_lane_s8(acc, 0);
}
214ad00d41bSArd Biesheuvel #endif
215ad00d41bSArd Biesheuvel 
216ad00d41bSArd Biesheuvel static const uint8_t permute[] __aligned(64) = {
217ad00d41bSArd Biesheuvel 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
218ad00d41bSArd Biesheuvel 	 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
219ad00d41bSArd Biesheuvel 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
220ad00d41bSArd Biesheuvel };
221ad00d41bSArd Biesheuvel 
crypto_aegis128_encrypt_chunk_neon(void * state,void * dst,const void * src,unsigned int size)222a4397635SArd Biesheuvel void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
223a4397635SArd Biesheuvel 					unsigned int size)
224a4397635SArd Biesheuvel {
225a4397635SArd Biesheuvel 	struct aegis128_state st = aegis128_load_state_neon(state);
226ad00d41bSArd Biesheuvel 	const int short_input = size < AEGIS_BLOCK_SIZE;
227a4397635SArd Biesheuvel 	uint8x16_t msg;
228a4397635SArd Biesheuvel 
22919842963SArd Biesheuvel 	preload_sbox();
23019842963SArd Biesheuvel 
231a4397635SArd Biesheuvel 	while (size >= AEGIS_BLOCK_SIZE) {
232a4397635SArd Biesheuvel 		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
233a4397635SArd Biesheuvel 
234a4397635SArd Biesheuvel 		msg = vld1q_u8(src);
235a4397635SArd Biesheuvel 		st = aegis128_update_neon(st, msg);
236ad00d41bSArd Biesheuvel 		msg ^= s;
237ad00d41bSArd Biesheuvel 		vst1q_u8(dst, msg);
238a4397635SArd Biesheuvel 
239a4397635SArd Biesheuvel 		size -= AEGIS_BLOCK_SIZE;
240a4397635SArd Biesheuvel 		src += AEGIS_BLOCK_SIZE;
241a4397635SArd Biesheuvel 		dst += AEGIS_BLOCK_SIZE;
242a4397635SArd Biesheuvel 	}
243a4397635SArd Biesheuvel 
244a4397635SArd Biesheuvel 	if (size > 0) {
245a4397635SArd Biesheuvel 		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
246ad00d41bSArd Biesheuvel 		uint8_t buf[AEGIS_BLOCK_SIZE];
247ad00d41bSArd Biesheuvel 		const void *in = src;
248ad00d41bSArd Biesheuvel 		void *out = dst;
249ad00d41bSArd Biesheuvel 		uint8x16_t m;
250a4397635SArd Biesheuvel 
251ad00d41bSArd Biesheuvel 		if (__builtin_expect(short_input, 0))
252ad00d41bSArd Biesheuvel 			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
253ad00d41bSArd Biesheuvel 
254ad00d41bSArd Biesheuvel 		m = vqtbl1q_u8(vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
255ad00d41bSArd Biesheuvel 			       vld1q_u8(permute + 32 - size));
256ad00d41bSArd Biesheuvel 
257ad00d41bSArd Biesheuvel 		st = aegis128_update_neon(st, m);
258ad00d41bSArd Biesheuvel 
259ad00d41bSArd Biesheuvel 		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
260ad00d41bSArd Biesheuvel 			 vqtbl1q_u8(m ^ s, vld1q_u8(permute + size)));
261ad00d41bSArd Biesheuvel 
262ad00d41bSArd Biesheuvel 		if (__builtin_expect(short_input, 0))
263ad00d41bSArd Biesheuvel 			memcpy(dst, out, size);
264ad00d41bSArd Biesheuvel 		else
265ad00d41bSArd Biesheuvel 			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
266a4397635SArd Biesheuvel 	}
267a4397635SArd Biesheuvel 
268a4397635SArd Biesheuvel 	aegis128_save_state_neon(st, state);
269a4397635SArd Biesheuvel }
270a4397635SArd Biesheuvel 
crypto_aegis128_decrypt_chunk_neon(void * state,void * dst,const void * src,unsigned int size)271a4397635SArd Biesheuvel void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
272a4397635SArd Biesheuvel 					unsigned int size)
273a4397635SArd Biesheuvel {
274a4397635SArd Biesheuvel 	struct aegis128_state st = aegis128_load_state_neon(state);
275ad00d41bSArd Biesheuvel 	const int short_input = size < AEGIS_BLOCK_SIZE;
276a4397635SArd Biesheuvel 	uint8x16_t msg;
277a4397635SArd Biesheuvel 
27819842963SArd Biesheuvel 	preload_sbox();
27919842963SArd Biesheuvel 
280a4397635SArd Biesheuvel 	while (size >= AEGIS_BLOCK_SIZE) {
281a4397635SArd Biesheuvel 		msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
282a4397635SArd Biesheuvel 		st = aegis128_update_neon(st, msg);
283a4397635SArd Biesheuvel 		vst1q_u8(dst, msg);
284a4397635SArd Biesheuvel 
285a4397635SArd Biesheuvel 		size -= AEGIS_BLOCK_SIZE;
286a4397635SArd Biesheuvel 		src += AEGIS_BLOCK_SIZE;
287a4397635SArd Biesheuvel 		dst += AEGIS_BLOCK_SIZE;
288a4397635SArd Biesheuvel 	}
289a4397635SArd Biesheuvel 
290a4397635SArd Biesheuvel 	if (size > 0) {
291a4397635SArd Biesheuvel 		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
292a4397635SArd Biesheuvel 		uint8_t buf[AEGIS_BLOCK_SIZE];
293ad00d41bSArd Biesheuvel 		const void *in = src;
294ad00d41bSArd Biesheuvel 		void *out = dst;
295ad00d41bSArd Biesheuvel 		uint8x16_t m;
296a4397635SArd Biesheuvel 
297ad00d41bSArd Biesheuvel 		if (__builtin_expect(short_input, 0))
298ad00d41bSArd Biesheuvel 			in = out = memcpy(buf + AEGIS_BLOCK_SIZE - size, src, size);
299a4397635SArd Biesheuvel 
300ad00d41bSArd Biesheuvel 		m = s ^ vqtbx1q_u8(s, vld1q_u8(in + size - AEGIS_BLOCK_SIZE),
301ad00d41bSArd Biesheuvel 				   vld1q_u8(permute + 32 - size));
302ad00d41bSArd Biesheuvel 
303ad00d41bSArd Biesheuvel 		st = aegis128_update_neon(st, m);
304ad00d41bSArd Biesheuvel 
305ad00d41bSArd Biesheuvel 		vst1q_u8(out + size - AEGIS_BLOCK_SIZE,
306ad00d41bSArd Biesheuvel 			 vqtbl1q_u8(m, vld1q_u8(permute + size)));
307ad00d41bSArd Biesheuvel 
308ad00d41bSArd Biesheuvel 		if (__builtin_expect(short_input, 0))
309ad00d41bSArd Biesheuvel 			memcpy(dst, out, size);
310ad00d41bSArd Biesheuvel 		else
311ad00d41bSArd Biesheuvel 			vst1q_u8(out - AEGIS_BLOCK_SIZE, msg);
312a4397635SArd Biesheuvel 	}
313a4397635SArd Biesheuvel 
314a4397635SArd Biesheuvel 	aegis128_save_state_neon(st, state);
315a4397635SArd Biesheuvel }
31652828263SArd Biesheuvel 
crypto_aegis128_final_neon(void * state,void * tag_xor,unsigned int assoclen,unsigned int cryptlen,unsigned int authsize)31797b70180SArd Biesheuvel int crypto_aegis128_final_neon(void *state, void *tag_xor,
31897b70180SArd Biesheuvel 			       unsigned int assoclen,
31997b70180SArd Biesheuvel 			       unsigned int cryptlen,
32097b70180SArd Biesheuvel 			       unsigned int authsize)
32152828263SArd Biesheuvel {
32252828263SArd Biesheuvel 	struct aegis128_state st = aegis128_load_state_neon(state);
32352828263SArd Biesheuvel 	uint8x16_t v;
32452828263SArd Biesheuvel 	int i;
32552828263SArd Biesheuvel 
32652828263SArd Biesheuvel 	preload_sbox();
32752828263SArd Biesheuvel 
32897b70180SArd Biesheuvel 	v = st.v[3] ^ (uint8x16_t)vcombine_u64(vmov_n_u64(8ULL * assoclen),
32997b70180SArd Biesheuvel 					       vmov_n_u64(8ULL * cryptlen));
33052828263SArd Biesheuvel 
33152828263SArd Biesheuvel 	for (i = 0; i < 7; i++)
33252828263SArd Biesheuvel 		st = aegis128_update_neon(st, v);
33352828263SArd Biesheuvel 
33497b70180SArd Biesheuvel 	v = st.v[0] ^ st.v[1] ^ st.v[2] ^ st.v[3] ^ st.v[4];
33597b70180SArd Biesheuvel 
33697b70180SArd Biesheuvel 	if (authsize > 0) {
33797b70180SArd Biesheuvel 		v = vqtbl1q_u8(~vceqq_u8(v, vld1q_u8(tag_xor)),
33897b70180SArd Biesheuvel 			       vld1q_u8(permute + authsize));
33997b70180SArd Biesheuvel 
34097b70180SArd Biesheuvel 		return vminvq_s8((int8x16_t)v);
34197b70180SArd Biesheuvel 	}
34297b70180SArd Biesheuvel 
34352828263SArd Biesheuvel 	vst1q_u8(tag_xor, v);
34497b70180SArd Biesheuvel 	return 0;
34552828263SArd Biesheuvel }
346