/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19
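
	// The v8-v19 aliases above overlap with the k00_16/k32_48, t3-t9 and
	// perm1-perm3 names used by the PMULL8 fallback. This is safe: the
	// 4-way aggregated code is only generated for the p64 variant, so
	// the two sets of names are never live at the same time.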

	.text
	.arch		armv8-a+crypto

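	//
	// 64x64 -> 128 bit carryless multiplication using the low (PMULL)
	// or high (PMULL2) 64-bit lanes of the inputs. These map directly
	// onto the ARMv8 Crypto Extensions instructions.
	//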
	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

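	//
	// Fallback 64x64 -> 128 bit carryless multiplication for CPUs that
	// only implement the 8x8 bit variant of PMULL. The wide product is
	// assembled from 8-bit polynomial multiplications of byte-rotated
	// copies of the operands (A1..A3 and B1..B4 below), with the
	// unwanted cross products masked off and the partial products
	// shifted into place in __pmull_p8_tail.
	//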
	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

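	// The p64 variant keeps the precomputed powers H^2..H^4 of the hash
	// key (stored right after H itself) in HH, HH3 and HH4, and folds
	// the two halves of each power into SHASH2/HH34 so that the middle
	// Karatsuba term (a1 + a0)(b1 + b0) needs only one PMULL per block.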
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
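	// MASK is set elsewhere (__pmull_pre_p64, pmull_gcm_ghash_4x) to
	// 0xe1 shifted left by 57, i.e. 0xc200000000000000 in each 64-bit
	// lane: the bit-reflected form of the GHASH reduction polynomial
	// x^128 + x^7 + x^2 + x + 1.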
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
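	// This performs the same reduction as __pmull_reduce_p64, but builds
	// the multiplication by the reduction constant out of shifts and
	// XORs so that no 64x64 bit PMULL is needed.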
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

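	//
	// GHASH update: fold a whole number of 16-byte blocks into the
	// digest. Register use matches the prototype further down:
	//   w0	number of blocks
	//   x1	digest dg[] (in/out)
	//   x2	source
	//   x3	hash key (for p64, the higher powers follow it in memory)
	//   x4	optional partial 'head' block to hash first
	//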
	.macro		__pmull_ghash, pn
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:	.ifc		\pn, p64
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

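	// Handle four blocks per iteration: multiply them by H^4..H using
	// Karatsuba multiplications, sum the partial products and do a
	// single modular reduction for the aggregate.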
1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b
	.endif

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
	__pmull_ghash	p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
	__pmull_ghash	p8
SYM_FUNC_END(pmull_ghash_update_p8)

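	// Note that KS0-KS3 and some of the round key registers below reuse
	// v-registers that pmull_gcm_ghash_4x also writes: the round keys
	// that must survive a call to it (K0-K5, KK, KL, KM) live in
	// registers it leaves untouched, while K6-K9 are reloaded from the
	// key schedule whenever they are needed.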
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

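	// Load the AES round keys that stay resident in registers: the first
	// six (K0-K5) and the final three (KK, KL, KM) of a \rounds-round
	// key schedule at \rk. K6-K9 share registers with GHASH temporaries
	// and are therefore reloaded on demand by enc_block and
	// pmull_gcm_enc_4x.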
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

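	// Encrypt a single block in \state using the key schedule loaded by
	// load_round_keys. Bit 2 of \rounds is only clear for AES-128 (10
	// rounds); for AES-192/AES-256 the extra rounds are handled out of
	// line, with bit 1 distinguishing 12 from 14 rounds.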
	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

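	//
	// Common GCM en/decryption code. The callers below pass:
	//   x0	remaining input length in bytes (0 means: only emit the tag)
	//   x1	destination buffer
	//   x2	source buffer
	//   x3	precomputed hash key powers H, H^2, H^3, H^4
	//   x4	GHASH accumulator dg[]
	//   x5	counter block, with the 32-bit block counter in its last word
	//   x6	AES round key schedule
	//   x7	number of AES rounds
	// Further arguments (the lengths/tag buffer and, for decryption, the
	// expected tag and the authsize) are passed on the stack and
	// accessed relative to the frame created by frame_push.
	//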
	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

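	// The last word of the counter block holds the 32-bit block counter
	// in big-endian order; pull it out and convert it to host order so
	// it can be incremented with ordinary arithmetic. It is converted
	// back and stored at label 5 below.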
	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u64 dg[], u8 ctr[],
	 *			  int rounds, u8 tag)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

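	//
	// Fold the blocks in INP0-INP3 (their number, 1-4, is in w9) into
	// the GHASH accumulator XL. A full set of four blocks is multiplied
	// by H^4, H^3, H^2 and H respectively and summed before a single
	// modular reduction; shorter final sets fold the accumulator into
	// the oldest valid block and use correspondingly lower powers.
	//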
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

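	//
	// Generate four blocks of keystream by encrypting the counter block
	// with four consecutive counter values (w8 - 4 .. w8 - 1, i.e. the
	// values from before the caller advanced w8), and XOR the result
	// into INP0-INP3 in place.
	//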
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	.section	".rodata", "a"
	.align		6
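	// Sliding window of 0xff and identity bytes: loading 16 bytes at a
	// variable offset into this table produces a TBL/TBX index vector
	// that keeps a contiguous run of input bytes and zeroes (TBL) or
	// preserves (TBX) the remaining ones. It is used by the code above
	// to handle partial final blocks.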
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
777