xref: /openbmc/linux/arch/x86/crypto/chacha-avx2-x86_64.S (revision f97cee494dc92395a668445bcd24d34c89f4ff8c)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
4 *
5 * Copyright (C) 2015 Martin Willi
6 */
7
8#include <linux/linkage.h>
9
10.section	.rodata.cst32.ROT8, "aM", @progbits, 32
11.align 32
	# ROT8: vpshufb byte-selection mask, repeated for both 128-bit lanes.
	# Stored little-endian, the selector bytes of each 32-bit word are
	# 3,0,1,2 (then 7,4,5,6 and so on), so every result word is the source
	# word rotated left by 8 bits. Loaded by the functions below for
	# their rotl32(.., 8) steps.
12ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
13	.octa 0x0e0d0c0f0a09080b0605040702010003
14
15.section	.rodata.cst32.ROT16, "aM", @progbits, 32
16.align 32
	# ROT16: vpshufb byte-selection mask, repeated for both 128-bit lanes.
	# Stored little-endian, the selector bytes of each 32-bit word are
	# 2,3,0,1 (then 6,7,4,5 and so on), so every result word is the source
	# word rotated left by 16 bits. Loaded by the functions below for
	# their rotl32(.., 16) steps.
17ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
18	.octa 0x0d0c0f0e09080b0a0504070601000302
19
20.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
21.align 32
	# CTRINC: eight 32-bit increments 0,1,2,...,7 (lowest dword first).
	# chacha_8block_xor_avx2 adds this vector to the broadcast word x12 so
	# that each of its eight dword lanes gets a distinct offset.
22CTRINC:	.octa 0x00000003000000020000000100000000
23	.octa 0x00000007000000060000000500000004
24
25.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
26.align 32
	# CTR2BL: per-lane increment for state row 3 (s3). The low 128-bit
	# lane adds 0 and the high 128-bit lane adds 1 to its lowest dword,
	# so the two lanes of a register work on two consecutive blocks.
27CTR2BL:	.octa 0x00000000000000000000000000000000
28	.octa 0x00000000000000000000000000000001
29
30.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
31.align 32
	# CTR4BL: per-lane increment for state row 3 (s3) of the second
	# register set in chacha_4block_xor_avx2. The low 128-bit lane adds 2
	# and the high 128-bit lane adds 3 to its lowest dword.
32CTR4BL:	.octa 0x00000000000000000000000000000002
33	.octa 0x00000000000000000000000000000003
34
35.text
36
37SYM_FUNC_START(chacha_2block_xor_avx2)
38	# %rdi: Input state matrix, s
39	# %rsi: up to 2 data blocks output, o
40	# %rdx: up to 2 data blocks input, i
41	# %rcx: input/output length in bytes
42	# %r8d: nrounds
43
44	# This function encrypts two ChaCha blocks by loading the state
45	# matrix twice across four AVX registers. It performs matrix operations
46	# on four words in each matrix in parallel, but requires shuffling to
47	# rearrange the words after each round.
	#
	# Register roles:
	#   ymm0..3   working rows x0..x3 (low lane: first block, high lane:
	#             second block)
	#   ymm8..11  copy of the initial rows, added back after the rounds
	#   ymm4/ymm5 ROT8/ROT16 shuffle masks, ymm6/ymm7 scratch
	#   rax       length in bytes (rcx is reused as the rep movsb count)
	# The tail path additionally uses r9, r10, r11, rcx, rsi and rdi.
	# The state memory at (%rdi) is only read; nothing is returned.
48
49	vzeroupper
50
	# Each 128-bit state row is broadcast to both lanes of a register.
51	# x0..3[0-1] = s0..3
52	vbroadcasti128	0x00(%rdi),%ymm0
53	vbroadcasti128	0x10(%rdi),%ymm1
54	vbroadcasti128	0x20(%rdi),%ymm2
55	vbroadcasti128	0x30(%rdi),%ymm3
56
	# Low lane keeps row 3 unchanged, high lane gets +1 in its lowest
	# dword (see CTR2BL), so the lanes process consecutive blocks.
57	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
58
	# Save the initial rows (including the per-lane increment) for the
	# final x + s addition.
59	vmovdqa		%ymm0,%ymm8
60	vmovdqa		%ymm1,%ymm9
61	vmovdqa		%ymm2,%ymm10
62	vmovdqa		%ymm3,%ymm11
63
	# Byte-shuffle masks used for the 8- and 16-bit rotations.
64	vmovdqa		ROT8(%rip),%ymm4
65	vmovdqa		ROT16(%rip),%ymm5
66
	# Keep the length in rax; rcx is overwritten in .Lxorpart2.
67	mov		%rcx,%rax
68
	# Each pass performs two rounds: one on the rows as loaded and one
	# on the rows after the word shuffles. The body always runs at least
	# once and the loop exits only when %r8d reaches exactly zero, so
	# nrounds is expected to be a positive even number.
69.Ldoubleround:
70
71	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
72	vpaddd		%ymm1,%ymm0,%ymm0
73	vpxor		%ymm0,%ymm3,%ymm3
74	vpshufb		%ymm5,%ymm3,%ymm3
75
76	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
77	vpaddd		%ymm3,%ymm2,%ymm2
78	vpxor		%ymm2,%ymm1,%ymm1
79	vmovdqa		%ymm1,%ymm6
80	vpslld		$12,%ymm6,%ymm6
81	vpsrld		$20,%ymm1,%ymm1
82	vpor		%ymm6,%ymm1,%ymm1
83
84	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
85	vpaddd		%ymm1,%ymm0,%ymm0
86	vpxor		%ymm0,%ymm3,%ymm3
87	vpshufb		%ymm4,%ymm3,%ymm3
88
89	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
90	vpaddd		%ymm3,%ymm2,%ymm2
91	vpxor		%ymm2,%ymm1,%ymm1
92	vmovdqa		%ymm1,%ymm7
93	vpslld		$7,%ymm7,%ymm7
94	vpsrld		$25,%ymm1,%ymm1
95	vpor		%ymm7,%ymm1,%ymm1
96
	# Rotate the words of rows 1..3 by one, two and three positions so
	# the next four steps operate on the diagonals.
97	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
98	vpshufd		$0x39,%ymm1,%ymm1
99	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
100	vpshufd		$0x4e,%ymm2,%ymm2
101	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
102	vpshufd		$0x93,%ymm3,%ymm3
103
104	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
105	vpaddd		%ymm1,%ymm0,%ymm0
106	vpxor		%ymm0,%ymm3,%ymm3
107	vpshufb		%ymm5,%ymm3,%ymm3
108
109	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
110	vpaddd		%ymm3,%ymm2,%ymm2
111	vpxor		%ymm2,%ymm1,%ymm1
112	vmovdqa		%ymm1,%ymm6
113	vpslld		$12,%ymm6,%ymm6
114	vpsrld		$20,%ymm1,%ymm1
115	vpor		%ymm6,%ymm1,%ymm1
116
117	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
118	vpaddd		%ymm1,%ymm0,%ymm0
119	vpxor		%ymm0,%ymm3,%ymm3
120	vpshufb		%ymm4,%ymm3,%ymm3
121
122	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
123	vpaddd		%ymm3,%ymm2,%ymm2
124	vpxor		%ymm2,%ymm1,%ymm1
125	vmovdqa		%ymm1,%ymm7
126	vpslld		$7,%ymm7,%ymm7
127	vpsrld		$25,%ymm1,%ymm1
128	vpor		%ymm7,%ymm1,%ymm1
129
	# Undo the word rotation applied above (inverse shuffles).
130	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
131	vpshufd		$0x93,%ymm1,%ymm1
132	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
133	vpshufd		$0x4e,%ymm2,%ymm2
134	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
135	vpshufd		$0x39,%ymm3,%ymm3
136
137	sub		$2,%r8d
138	jnz		.Ldoubleround
139
	# Output stage. For every row, ymm7 = x + s holds keystream for both
	# blocks: the low 128 bits belong to the first block and are xored
	# with the input and stored right away, the high 128 bits belong to
	# the second block and are parked in xmm0..3 by vextracti128.
	# Before each 16-byte chunk the length in rax is checked; when the
	# chunk is not fully covered, control moves to .Lxorpart2 with the
	# keystream for that chunk in xmm7.
	# NOTE(review): jl is a signed comparison on the byte length; this
	# looks fine for small non-negative lengths - confirm with callers.
140	# o0 = i0 ^ (x0 + s0)
141	vpaddd		%ymm8,%ymm0,%ymm7
142	cmp		$0x10,%rax
143	jl		.Lxorpart2
144	vpxor		0x00(%rdx),%xmm7,%xmm6
145	vmovdqu		%xmm6,0x00(%rsi)
146	vextracti128	$1,%ymm7,%xmm0
147	# o1 = i1 ^ (x1 + s1)
148	vpaddd		%ymm9,%ymm1,%ymm7
149	cmp		$0x20,%rax
150	jl		.Lxorpart2
151	vpxor		0x10(%rdx),%xmm7,%xmm6
152	vmovdqu		%xmm6,0x10(%rsi)
153	vextracti128	$1,%ymm7,%xmm1
154	# o2 = i2 ^ (x2 + s2)
155	vpaddd		%ymm10,%ymm2,%ymm7
156	cmp		$0x30,%rax
157	jl		.Lxorpart2
158	vpxor		0x20(%rdx),%xmm7,%xmm6
159	vmovdqu		%xmm6,0x20(%rsi)
160	vextracti128	$1,%ymm7,%xmm2
161	# o3 = i3 ^ (x3 + s3)
162	vpaddd		%ymm11,%ymm3,%ymm7
163	cmp		$0x40,%rax
164	jl		.Lxorpart2
165	vpxor		0x30(%rdx),%xmm7,%xmm6
166	vmovdqu		%xmm6,0x30(%rsi)
167	vextracti128	$1,%ymm7,%xmm3
168
	# The parked high lanes are moved to xmm7 first so that the tail
	# path always finds the current keystream chunk in xmm7.
169	# xor and write second block
170	vmovdqa		%xmm0,%xmm7
171	cmp		$0x50,%rax
172	jl		.Lxorpart2
173	vpxor		0x40(%rdx),%xmm7,%xmm6
174	vmovdqu		%xmm6,0x40(%rsi)
175
176	vmovdqa		%xmm1,%xmm7
177	cmp		$0x60,%rax
178	jl		.Lxorpart2
179	vpxor		0x50(%rdx),%xmm7,%xmm6
180	vmovdqu		%xmm6,0x50(%rsi)
181
182	vmovdqa		%xmm2,%xmm7
183	cmp		$0x70,%rax
184	jl		.Lxorpart2
185	vpxor		0x60(%rdx),%xmm7,%xmm6
186	vmovdqu		%xmm6,0x60(%rsi)
187
188	vmovdqa		%xmm3,%xmm7
189	cmp		$0x80,%rax
190	jl		.Lxorpart2
191	vpxor		0x70(%rdx),%xmm7,%xmm6
192	vmovdqu		%xmm6,0x70(%rsi)
193
	# Common exit, also reached from the tail path below.
194.Ldone2:
195	vzeroupper
196	ret
197
198.Lxorpart2:
199	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes (length mod 16); nothing is left to
	# do when the length is a multiple of 16.
200	mov		%rax,%r9
201	and		$0x0f,%r9
202	jz		.Ldone2
	# rax = length rounded down to 16 = offset of the partial chunk
203	and		$~0x0f,%rax
204
	# rsi is needed as the rep movsb source, keep the output pointer
205	mov		%rsi,%r11
206
	# Remember the incoming stack pointer (plus 8) in r10, then reserve
	# at least 16 bytes and round rsp down to a 32-byte boundary to get
	# an aligned bounce buffer.
207	lea		8(%rsp),%r10
208	sub		$0x10,%rsp
209	and		$~31,%rsp
210
	# Copy the r9 trailing input bytes into the bounce buffer.
211	lea		(%rdx,%rax),%rsi
212	mov		%rsp,%rdi
213	mov		%r9,%rcx
214	rep movsb
215
	# Xor the whole 16-byte buffer with the keystream chunk; bytes past
	# r9 are whatever was on the stack and are never copied out.
216	vpxor		0x00(%rsp),%xmm7,%xmm7
217	vmovdqa		%xmm7,0x00(%rsp)
218
	# Copy only the r9 valid result bytes to the output.
219	mov		%rsp,%rsi
220	lea		(%r11,%rax),%rdi
221	mov		%r9,%rcx
222	rep movsb
223
	# Restore the stack pointer saved in r10 above.
224	lea		-8(%r10),%rsp
225	jmp		.Ldone2
226
227SYM_FUNC_END(chacha_2block_xor_avx2)
228
229SYM_FUNC_START(chacha_4block_xor_avx2)
230	# %rdi: Input state matrix, s
231	# %rsi: up to 4 data blocks output, o
232	# %rdx: up to 4 data blocks input, i
233	# %rcx: input/output length in bytes
234	# %r8d: nrounds
235
236	# This function encrypts four ChaCha blocks by loading the state
237	# matrix four times across eight AVX registers. It performs matrix
238	# operations on four words in two matrices in parallel, sequentially
239	# to the operations on the four words of the other two matrices. The
240	# required word shuffling has a rather high latency, we can do the
241	# arithmetic on two matrix-pairs without much slowdown.
	#
	# Register roles:
	#   ymm0..3    working rows of the first and second block
	#   ymm4..7    working rows of the third and fourth block
	#   ymm11..13  initial rows 0..2, shared by both register sets
	#   ymm14      initial row 3 of the first set (lane increments 0, 1)
	#   ymm15      initial row 3 of the second set (lane increments 2, 3)
	#   ymm8/ymm9  ROT8/ROT16 shuffle masks, ymm10 scratch
	#   rax        length in bytes (rcx is reused as the rep movsb count)
	# After the rounds ymm9 and ymm10 are reused as output scratch.
	# The tail path additionally uses r9, r10, r11, rcx, rsi and rdi.
242
243	vzeroupper
244
	# Each 128-bit state row is broadcast to both lanes of a register.
245	# x0..3[0-3] = s0..3
246	vbroadcasti128	0x00(%rdi),%ymm0
247	vbroadcasti128	0x10(%rdi),%ymm1
248	vbroadcasti128	0x20(%rdi),%ymm2
249	vbroadcasti128	0x30(%rdi),%ymm3
250
	# Second register set starts as a copy of the first.
251	vmovdqa		%ymm0,%ymm4
252	vmovdqa		%ymm1,%ymm5
253	vmovdqa		%ymm2,%ymm6
254	vmovdqa		%ymm3,%ymm7
255
	# Row 3 lane increments: 0 and 1 for the first set, 2 and 3 for
	# the second set (see CTR2BL and CTR4BL).
256	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
257	vpaddd		CTR4BL(%rip),%ymm7,%ymm7
258
	# Save the initial rows for the final x + s addition. Rows 0..2 are
	# identical for both sets, row 3 is kept per set.
259	vmovdqa		%ymm0,%ymm11
260	vmovdqa		%ymm1,%ymm12
261	vmovdqa		%ymm2,%ymm13
262	vmovdqa		%ymm3,%ymm14
263	vmovdqa		%ymm7,%ymm15
264
	# Byte-shuffle masks used for the 8- and 16-bit rotations.
265	vmovdqa		ROT8(%rip),%ymm8
266	vmovdqa		ROT16(%rip),%ymm9
267
	# Keep the length in rax; rcx is overwritten in .Lxorpart4.
268	mov		%rcx,%rax
269
	# Each pass performs two rounds on both register sets; every step is
	# issued for ymm0..3 first and then repeated for ymm4..7. The body
	# always runs at least once and the loop exits only when %r8d reaches
	# exactly zero, so nrounds is expected to be a positive even number.
270.Ldoubleround4:
271
272	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
273	vpaddd		%ymm1,%ymm0,%ymm0
274	vpxor		%ymm0,%ymm3,%ymm3
275	vpshufb		%ymm9,%ymm3,%ymm3
276
277	vpaddd		%ymm5,%ymm4,%ymm4
278	vpxor		%ymm4,%ymm7,%ymm7
279	vpshufb		%ymm9,%ymm7,%ymm7
280
281	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
282	vpaddd		%ymm3,%ymm2,%ymm2
283	vpxor		%ymm2,%ymm1,%ymm1
284	vmovdqa		%ymm1,%ymm10
285	vpslld		$12,%ymm10,%ymm10
286	vpsrld		$20,%ymm1,%ymm1
287	vpor		%ymm10,%ymm1,%ymm1
288
289	vpaddd		%ymm7,%ymm6,%ymm6
290	vpxor		%ymm6,%ymm5,%ymm5
291	vmovdqa		%ymm5,%ymm10
292	vpslld		$12,%ymm10,%ymm10
293	vpsrld		$20,%ymm5,%ymm5
294	vpor		%ymm10,%ymm5,%ymm5
295
296	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
297	vpaddd		%ymm1,%ymm0,%ymm0
298	vpxor		%ymm0,%ymm3,%ymm3
299	vpshufb		%ymm8,%ymm3,%ymm3
300
301	vpaddd		%ymm5,%ymm4,%ymm4
302	vpxor		%ymm4,%ymm7,%ymm7
303	vpshufb		%ymm8,%ymm7,%ymm7
304
305	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
306	vpaddd		%ymm3,%ymm2,%ymm2
307	vpxor		%ymm2,%ymm1,%ymm1
308	vmovdqa		%ymm1,%ymm10
309	vpslld		$7,%ymm10,%ymm10
310	vpsrld		$25,%ymm1,%ymm1
311	vpor		%ymm10,%ymm1,%ymm1
312
313	vpaddd		%ymm7,%ymm6,%ymm6
314	vpxor		%ymm6,%ymm5,%ymm5
315	vmovdqa		%ymm5,%ymm10
316	vpslld		$7,%ymm10,%ymm10
317	vpsrld		$25,%ymm5,%ymm5
318	vpor		%ymm10,%ymm5,%ymm5
319
	# Rotate the words of rows 1..3 by one, two and three positions so
	# the next four steps operate on the diagonals.
320	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
321	vpshufd		$0x39,%ymm1,%ymm1
322	vpshufd		$0x39,%ymm5,%ymm5
323	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
324	vpshufd		$0x4e,%ymm2,%ymm2
325	vpshufd		$0x4e,%ymm6,%ymm6
326	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
327	vpshufd		$0x93,%ymm3,%ymm3
328	vpshufd		$0x93,%ymm7,%ymm7
329
330	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
331	vpaddd		%ymm1,%ymm0,%ymm0
332	vpxor		%ymm0,%ymm3,%ymm3
333	vpshufb		%ymm9,%ymm3,%ymm3
334
335	vpaddd		%ymm5,%ymm4,%ymm4
336	vpxor		%ymm4,%ymm7,%ymm7
337	vpshufb		%ymm9,%ymm7,%ymm7
338
339	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
340	vpaddd		%ymm3,%ymm2,%ymm2
341	vpxor		%ymm2,%ymm1,%ymm1
342	vmovdqa		%ymm1,%ymm10
343	vpslld		$12,%ymm10,%ymm10
344	vpsrld		$20,%ymm1,%ymm1
345	vpor		%ymm10,%ymm1,%ymm1
346
347	vpaddd		%ymm7,%ymm6,%ymm6
348	vpxor		%ymm6,%ymm5,%ymm5
349	vmovdqa		%ymm5,%ymm10
350	vpslld		$12,%ymm10,%ymm10
351	vpsrld		$20,%ymm5,%ymm5
352	vpor		%ymm10,%ymm5,%ymm5
353
354	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
355	vpaddd		%ymm1,%ymm0,%ymm0
356	vpxor		%ymm0,%ymm3,%ymm3
357	vpshufb		%ymm8,%ymm3,%ymm3
358
359	vpaddd		%ymm5,%ymm4,%ymm4
360	vpxor		%ymm4,%ymm7,%ymm7
361	vpshufb		%ymm8,%ymm7,%ymm7
362
363	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
364	vpaddd		%ymm3,%ymm2,%ymm2
365	vpxor		%ymm2,%ymm1,%ymm1
366	vmovdqa		%ymm1,%ymm10
367	vpslld		$7,%ymm10,%ymm10
368	vpsrld		$25,%ymm1,%ymm1
369	vpor		%ymm10,%ymm1,%ymm1
370
371	vpaddd		%ymm7,%ymm6,%ymm6
372	vpxor		%ymm6,%ymm5,%ymm5
373	vmovdqa		%ymm5,%ymm10
374	vpslld		$7,%ymm10,%ymm10
375	vpsrld		$25,%ymm5,%ymm5
376	vpor		%ymm10,%ymm5,%ymm5
377
	# Undo the word rotation applied above (inverse shuffles).
378	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
379	vpshufd		$0x93,%ymm1,%ymm1
380	vpshufd		$0x93,%ymm5,%ymm5
381	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
382	vpshufd		$0x4e,%ymm2,%ymm2
383	vpshufd		$0x4e,%ymm6,%ymm6
384	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
385	vpshufd		$0x39,%ymm3,%ymm3
386	vpshufd		$0x39,%ymm7,%ymm7
387
388	sub		$2,%r8d
389	jnz		.Ldoubleround4
390
	# Output stage. For every row, ymm10 = x + s holds keystream for two
	# blocks: the low 128 bits are xored with the input and stored right
	# away, the high 128 bits are parked in the low half of the row
	# register by vextracti128 and written with the following block.
	# ymm9 (ROT16 mask during the rounds) is reused as the xor result.
	# Before each 16-byte chunk the length in rax is checked; when the
	# chunk is not fully covered, control moves to .Lxorpart4 with the
	# keystream for that chunk in xmm10.
	# NOTE(review): jl is a signed comparison on the byte length; this
	# looks fine for small non-negative lengths - confirm with callers.
391	# o0 = i0 ^ (x0 + s0), first block
392	vpaddd		%ymm11,%ymm0,%ymm10
393	cmp		$0x10,%rax
394	jl		.Lxorpart4
395	vpxor		0x00(%rdx),%xmm10,%xmm9
396	vmovdqu		%xmm9,0x00(%rsi)
397	vextracti128	$1,%ymm10,%xmm0
398	# o1 = i1 ^ (x1 + s1), first block
399	vpaddd		%ymm12,%ymm1,%ymm10
400	cmp		$0x20,%rax
401	jl		.Lxorpart4
402	vpxor		0x10(%rdx),%xmm10,%xmm9
403	vmovdqu		%xmm9,0x10(%rsi)
404	vextracti128	$1,%ymm10,%xmm1
405	# o2 = i2 ^ (x2 + s2), first block
406	vpaddd		%ymm13,%ymm2,%ymm10
407	cmp		$0x30,%rax
408	jl		.Lxorpart4
409	vpxor		0x20(%rdx),%xmm10,%xmm9
410	vmovdqu		%xmm9,0x20(%rsi)
411	vextracti128	$1,%ymm10,%xmm2
412	# o3 = i3 ^ (x3 + s3), first block
413	vpaddd		%ymm14,%ymm3,%ymm10
414	cmp		$0x40,%rax
415	jl		.Lxorpart4
416	vpxor		0x30(%rdx),%xmm10,%xmm9
417	vmovdqu		%xmm9,0x30(%rsi)
418	vextracti128	$1,%ymm10,%xmm3
419
	# The parked high lanes are moved to xmm10 first so that the tail
	# path always finds the current keystream chunk in xmm10.
420	# xor and write second block
421	vmovdqa		%xmm0,%xmm10
422	cmp		$0x50,%rax
423	jl		.Lxorpart4
424	vpxor		0x40(%rdx),%xmm10,%xmm9
425	vmovdqu		%xmm9,0x40(%rsi)
426
427	vmovdqa		%xmm1,%xmm10
428	cmp		$0x60,%rax
429	jl		.Lxorpart4
430	vpxor		0x50(%rdx),%xmm10,%xmm9
431	vmovdqu		%xmm9,0x50(%rsi)
432
433	vmovdqa		%xmm2,%xmm10
434	cmp		$0x70,%rax
435	jl		.Lxorpart4
436	vpxor		0x60(%rdx),%xmm10,%xmm9
437	vmovdqu		%xmm9,0x60(%rsi)
438
439	vmovdqa		%xmm3,%xmm10
440	cmp		$0x80,%rax
441	jl		.Lxorpart4
442	vpxor		0x70(%rdx),%xmm10,%xmm9
443	vmovdqu		%xmm9,0x70(%rsi)
444
	# Same scheme for the second register set (ymm4..7); row 3 uses the
	# initial value kept in ymm15.
445	# o0 = i0 ^ (x0 + s0), third block
446	vpaddd		%ymm11,%ymm4,%ymm10
447	cmp		$0x90,%rax
448	jl		.Lxorpart4
449	vpxor		0x80(%rdx),%xmm10,%xmm9
450	vmovdqu		%xmm9,0x80(%rsi)
451	vextracti128	$1,%ymm10,%xmm4
452	# o1 = i1 ^ (x1 + s1), third block
453	vpaddd		%ymm12,%ymm5,%ymm10
454	cmp		$0xa0,%rax
455	jl		.Lxorpart4
456	vpxor		0x90(%rdx),%xmm10,%xmm9
457	vmovdqu		%xmm9,0x90(%rsi)
458	vextracti128	$1,%ymm10,%xmm5
459	# o2 = i2 ^ (x2 + s2), third block
460	vpaddd		%ymm13,%ymm6,%ymm10
461	cmp		$0xb0,%rax
462	jl		.Lxorpart4
463	vpxor		0xa0(%rdx),%xmm10,%xmm9
464	vmovdqu		%xmm9,0xa0(%rsi)
465	vextracti128	$1,%ymm10,%xmm6
466	# o3 = i3 ^ (x3 + s3), third block
467	vpaddd		%ymm15,%ymm7,%ymm10
468	cmp		$0xc0,%rax
469	jl		.Lxorpart4
470	vpxor		0xb0(%rdx),%xmm10,%xmm9
471	vmovdqu		%xmm9,0xb0(%rsi)
472	vextracti128	$1,%ymm10,%xmm7
473
474	# xor and write fourth block
475	vmovdqa		%xmm4,%xmm10
476	cmp		$0xd0,%rax
477	jl		.Lxorpart4
478	vpxor		0xc0(%rdx),%xmm10,%xmm9
479	vmovdqu		%xmm9,0xc0(%rsi)
480
481	vmovdqa		%xmm5,%xmm10
482	cmp		$0xe0,%rax
483	jl		.Lxorpart4
484	vpxor		0xd0(%rdx),%xmm10,%xmm9
485	vmovdqu		%xmm9,0xd0(%rsi)
486
487	vmovdqa		%xmm6,%xmm10
488	cmp		$0xf0,%rax
489	jl		.Lxorpart4
490	vpxor		0xe0(%rdx),%xmm10,%xmm9
491	vmovdqu		%xmm9,0xe0(%rsi)
492
493	vmovdqa		%xmm7,%xmm10
494	cmp		$0x100,%rax
495	jl		.Lxorpart4
496	vpxor		0xf0(%rdx),%xmm10,%xmm9
497	vmovdqu		%xmm9,0xf0(%rsi)
498
	# Common exit, also reached from the tail path below.
499.Ldone4:
500	vzeroupper
501	ret
502
503.Lxorpart4:
504	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes (length mod 16); nothing is left to
	# do when the length is a multiple of 16.
505	mov		%rax,%r9
506	and		$0x0f,%r9
507	jz		.Ldone4
	# rax = length rounded down to 16 = offset of the partial chunk
508	and		$~0x0f,%rax
509
	# rsi is needed as the rep movsb source, keep the output pointer
510	mov		%rsi,%r11
511
	# Remember the incoming stack pointer (plus 8) in r10, then reserve
	# at least 16 bytes and round rsp down to a 32-byte boundary to get
	# an aligned bounce buffer.
512	lea		8(%rsp),%r10
513	sub		$0x10,%rsp
514	and		$~31,%rsp
515
	# Copy the r9 trailing input bytes into the bounce buffer.
516	lea		(%rdx,%rax),%rsi
517	mov		%rsp,%rdi
518	mov		%r9,%rcx
519	rep movsb
520
	# Xor the whole 16-byte buffer with the keystream chunk; bytes past
	# r9 are whatever was on the stack and are never copied out.
521	vpxor		0x00(%rsp),%xmm10,%xmm10
522	vmovdqa		%xmm10,0x00(%rsp)
523
	# Copy only the r9 valid result bytes to the output.
524	mov		%rsp,%rsi
525	lea		(%r11,%rax),%rdi
526	mov		%r9,%rcx
527	rep movsb
528
	# Restore the stack pointer saved in r10 above.
529	lea		-8(%r10),%rsp
530	jmp		.Ldone4
531
532SYM_FUNC_END(chacha_4block_xor_avx2)
533
534SYM_FUNC_START(chacha_8block_xor_avx2)
535	# %rdi: Input state matrix, s
536	# %rsi: up to 8 data blocks output, o
537	# %rdx: up to 8 data blocks input, i
538	# %rcx: input/output length in bytes
539	# %r8d: nrounds
540
541	# This function encrypts eight consecutive ChaCha blocks by loading
542	# the state matrix in AVX registers eight times. As we need some
543	# scratch registers, we save the first four registers on the stack. The
544	# algorithm performs each operation on the corresponding word of each
545	# state matrix, hence requires no word shuffling. For final XORing step
546	# we transpose the matrix by interleaving 32-, 64- and then 128-bit
547	# words, which allows us to do XOR in AVX registers. 8/16-bit word
548	# rotation is done with the slightly better performing byte shuffling,
549	# 7/12-bit word rotation uses traditional shift+OR.
	#
	# Register roles during the rounds:
	#   0x00..0x60(%rsp)  x0..x3, one 32-byte slot each
	#   ymm4..15          x4..x15, one state word per register with the
	#                     eight blocks in the eight dword lanes
	#   ymm0              scratch
	#   ymm1/ymm2/ymm3    CTRINC / ROT8 / ROT16 constants
	#   rax               length in bytes, r10 = incoming rsp + 8
	# The tail path additionally uses r9, r11, rcx, rsi and rdi.
550
551	vzeroupper
	# r10 remembers the incoming stack pointer (plus 8); it is used at
	# .Ldone8 to restore rsp.
552	# 4 * 32 byte stack, 32-byte aligned
553	lea		8(%rsp),%r10
554	and		$~31, %rsp
555	sub		$0x80, %rsp
	# Keep the length in rax; rcx is overwritten in .Lxorpart8.
556	mov		%rcx,%rax
557
	# Every 32-bit state word is broadcast to all eight dword lanes.
558	# x0..15[0-7] = s[0..15]
559	vpbroadcastd	0x00(%rdi),%ymm0
560	vpbroadcastd	0x04(%rdi),%ymm1
561	vpbroadcastd	0x08(%rdi),%ymm2
562	vpbroadcastd	0x0c(%rdi),%ymm3
563	vpbroadcastd	0x10(%rdi),%ymm4
564	vpbroadcastd	0x14(%rdi),%ymm5
565	vpbroadcastd	0x18(%rdi),%ymm6
566	vpbroadcastd	0x1c(%rdi),%ymm7
567	vpbroadcastd	0x20(%rdi),%ymm8
568	vpbroadcastd	0x24(%rdi),%ymm9
569	vpbroadcastd	0x28(%rdi),%ymm10
570	vpbroadcastd	0x2c(%rdi),%ymm11
571	vpbroadcastd	0x30(%rdi),%ymm12
572	vpbroadcastd	0x34(%rdi),%ymm13
573	vpbroadcastd	0x38(%rdi),%ymm14
574	vpbroadcastd	0x3c(%rdi),%ymm15
575	# x0..3 on stack
576	vmovdqa		%ymm0,0x00(%rsp)
577	vmovdqa		%ymm1,0x20(%rsp)
578	vmovdqa		%ymm2,0x40(%rsp)
579	vmovdqa		%ymm3,0x60(%rsp)
580
	# ymm1..3 are free now and hold the constants for the whole loop.
581	vmovdqa		CTRINC(%rip),%ymm1
582	vmovdqa		ROT8(%rip),%ymm2
583	vmovdqa		ROT16(%rip),%ymm3
584
585	# x12 += counter values 0-7
586	vpaddd		%ymm1,%ymm12,%ymm12
587
	# Each pass performs two rounds: first on the columns (x0,x4,x8,x12
	# and so on), then on the diagonals (x0,x5,x10,x15 and so on). The
	# body always runs at least once and the loop exits only when %r8d
	# reaches exactly zero, so nrounds is expected to be a positive even
	# number.
588.Ldoubleround8:
589	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
590	vpaddd		0x00(%rsp),%ymm4,%ymm0
591	vmovdqa		%ymm0,0x00(%rsp)
592	vpxor		%ymm0,%ymm12,%ymm12
593	vpshufb		%ymm3,%ymm12,%ymm12
594	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
595	vpaddd		0x20(%rsp),%ymm5,%ymm0
596	vmovdqa		%ymm0,0x20(%rsp)
597	vpxor		%ymm0,%ymm13,%ymm13
598	vpshufb		%ymm3,%ymm13,%ymm13
599	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
600	vpaddd		0x40(%rsp),%ymm6,%ymm0
601	vmovdqa		%ymm0,0x40(%rsp)
602	vpxor		%ymm0,%ymm14,%ymm14
603	vpshufb		%ymm3,%ymm14,%ymm14
604	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
605	vpaddd		0x60(%rsp),%ymm7,%ymm0
606	vmovdqa		%ymm0,0x60(%rsp)
607	vpxor		%ymm0,%ymm15,%ymm15
608	vpshufb		%ymm3,%ymm15,%ymm15
609
610	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
611	vpaddd		%ymm12,%ymm8,%ymm8
612	vpxor		%ymm8,%ymm4,%ymm4
613	vpslld		$12,%ymm4,%ymm0
614	vpsrld		$20,%ymm4,%ymm4
615	vpor		%ymm0,%ymm4,%ymm4
616	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
617	vpaddd		%ymm13,%ymm9,%ymm9
618	vpxor		%ymm9,%ymm5,%ymm5
619	vpslld		$12,%ymm5,%ymm0
620	vpsrld		$20,%ymm5,%ymm5
621	vpor		%ymm0,%ymm5,%ymm5
622	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
623	vpaddd		%ymm14,%ymm10,%ymm10
624	vpxor		%ymm10,%ymm6,%ymm6
625	vpslld		$12,%ymm6,%ymm0
626	vpsrld		$20,%ymm6,%ymm6
627	vpor		%ymm0,%ymm6,%ymm6
628	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
629	vpaddd		%ymm15,%ymm11,%ymm11
630	vpxor		%ymm11,%ymm7,%ymm7
631	vpslld		$12,%ymm7,%ymm0
632	vpsrld		$20,%ymm7,%ymm7
633	vpor		%ymm0,%ymm7,%ymm7
634
635	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
636	vpaddd		0x00(%rsp),%ymm4,%ymm0
637	vmovdqa		%ymm0,0x00(%rsp)
638	vpxor		%ymm0,%ymm12,%ymm12
639	vpshufb		%ymm2,%ymm12,%ymm12
640	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
641	vpaddd		0x20(%rsp),%ymm5,%ymm0
642	vmovdqa		%ymm0,0x20(%rsp)
643	vpxor		%ymm0,%ymm13,%ymm13
644	vpshufb		%ymm2,%ymm13,%ymm13
645	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
646	vpaddd		0x40(%rsp),%ymm6,%ymm0
647	vmovdqa		%ymm0,0x40(%rsp)
648	vpxor		%ymm0,%ymm14,%ymm14
649	vpshufb		%ymm2,%ymm14,%ymm14
650	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
651	vpaddd		0x60(%rsp),%ymm7,%ymm0
652	vmovdqa		%ymm0,0x60(%rsp)
653	vpxor		%ymm0,%ymm15,%ymm15
654	vpshufb		%ymm2,%ymm15,%ymm15
655
656	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
657	vpaddd		%ymm12,%ymm8,%ymm8
658	vpxor		%ymm8,%ymm4,%ymm4
659	vpslld		$7,%ymm4,%ymm0
660	vpsrld		$25,%ymm4,%ymm4
661	vpor		%ymm0,%ymm4,%ymm4
662	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
663	vpaddd		%ymm13,%ymm9,%ymm9
664	vpxor		%ymm9,%ymm5,%ymm5
665	vpslld		$7,%ymm5,%ymm0
666	vpsrld		$25,%ymm5,%ymm5
667	vpor		%ymm0,%ymm5,%ymm5
668	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
669	vpaddd		%ymm14,%ymm10,%ymm10
670	vpxor		%ymm10,%ymm6,%ymm6
671	vpslld		$7,%ymm6,%ymm0
672	vpsrld		$25,%ymm6,%ymm6
673	vpor		%ymm0,%ymm6,%ymm6
674	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
675	vpaddd		%ymm15,%ymm11,%ymm11
676	vpxor		%ymm11,%ymm7,%ymm7
677	vpslld		$7,%ymm7,%ymm0
678	vpsrld		$25,%ymm7,%ymm7
679	vpor		%ymm0,%ymm7,%ymm7
680
	# Second round of the pass: same steps on the diagonals.
681	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
682	vpaddd		0x00(%rsp),%ymm5,%ymm0
683	vmovdqa		%ymm0,0x00(%rsp)
684	vpxor		%ymm0,%ymm15,%ymm15
685	vpshufb		%ymm3,%ymm15,%ymm15
686	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
687	vpaddd		0x20(%rsp),%ymm6,%ymm0
688	vmovdqa		%ymm0,0x20(%rsp)
689	vpxor		%ymm0,%ymm12,%ymm12
690	vpshufb		%ymm3,%ymm12,%ymm12
691	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
692	vpaddd		0x40(%rsp),%ymm7,%ymm0
693	vmovdqa		%ymm0,0x40(%rsp)
694	vpxor		%ymm0,%ymm13,%ymm13
695	vpshufb		%ymm3,%ymm13,%ymm13
696	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
697	vpaddd		0x60(%rsp),%ymm4,%ymm0
698	vmovdqa		%ymm0,0x60(%rsp)
699	vpxor		%ymm0,%ymm14,%ymm14
700	vpshufb		%ymm3,%ymm14,%ymm14
701
702	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
703	vpaddd		%ymm15,%ymm10,%ymm10
704	vpxor		%ymm10,%ymm5,%ymm5
705	vpslld		$12,%ymm5,%ymm0
706	vpsrld		$20,%ymm5,%ymm5
707	vpor		%ymm0,%ymm5,%ymm5
708	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
709	vpaddd		%ymm12,%ymm11,%ymm11
710	vpxor		%ymm11,%ymm6,%ymm6
711	vpslld		$12,%ymm6,%ymm0
712	vpsrld		$20,%ymm6,%ymm6
713	vpor		%ymm0,%ymm6,%ymm6
714	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
715	vpaddd		%ymm13,%ymm8,%ymm8
716	vpxor		%ymm8,%ymm7,%ymm7
717	vpslld		$12,%ymm7,%ymm0
718	vpsrld		$20,%ymm7,%ymm7
719	vpor		%ymm0,%ymm7,%ymm7
720	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
721	vpaddd		%ymm14,%ymm9,%ymm9
722	vpxor		%ymm9,%ymm4,%ymm4
723	vpslld		$12,%ymm4,%ymm0
724	vpsrld		$20,%ymm4,%ymm4
725	vpor		%ymm0,%ymm4,%ymm4
726
727	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
728	vpaddd		0x00(%rsp),%ymm5,%ymm0
729	vmovdqa		%ymm0,0x00(%rsp)
730	vpxor		%ymm0,%ymm15,%ymm15
731	vpshufb		%ymm2,%ymm15,%ymm15
732	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
733	vpaddd		0x20(%rsp),%ymm6,%ymm0
734	vmovdqa		%ymm0,0x20(%rsp)
735	vpxor		%ymm0,%ymm12,%ymm12
736	vpshufb		%ymm2,%ymm12,%ymm12
737	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
738	vpaddd		0x40(%rsp),%ymm7,%ymm0
739	vmovdqa		%ymm0,0x40(%rsp)
740	vpxor		%ymm0,%ymm13,%ymm13
741	vpshufb		%ymm2,%ymm13,%ymm13
742	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
743	vpaddd		0x60(%rsp),%ymm4,%ymm0
744	vmovdqa		%ymm0,0x60(%rsp)
745	vpxor		%ymm0,%ymm14,%ymm14
746	vpshufb		%ymm2,%ymm14,%ymm14
747
748	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
749	vpaddd		%ymm15,%ymm10,%ymm10
750	vpxor		%ymm10,%ymm5,%ymm5
751	vpslld		$7,%ymm5,%ymm0
752	vpsrld		$25,%ymm5,%ymm5
753	vpor		%ymm0,%ymm5,%ymm5
754	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
755	vpaddd		%ymm12,%ymm11,%ymm11
756	vpxor		%ymm11,%ymm6,%ymm6
757	vpslld		$7,%ymm6,%ymm0
758	vpsrld		$25,%ymm6,%ymm6
759	vpor		%ymm0,%ymm6,%ymm6
760	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
761	vpaddd		%ymm13,%ymm8,%ymm8
762	vpxor		%ymm8,%ymm7,%ymm7
763	vpslld		$7,%ymm7,%ymm0
764	vpsrld		$25,%ymm7,%ymm7
765	vpor		%ymm0,%ymm7,%ymm7
766	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
767	vpaddd		%ymm14,%ymm9,%ymm9
768	vpxor		%ymm9,%ymm4,%ymm4
769	vpslld		$7,%ymm4,%ymm0
770	vpsrld		$25,%ymm4,%ymm4
771	vpor		%ymm0,%ymm4,%ymm4
772
773	sub		$2,%r8d
774	jnz		.Ldoubleround8
775
	# Add the original state back. The words are re-read from (%rdi)
	# and broadcast again; x0..x3 are updated in their stack slots.
776	# x0..15[0-7] += s[0..15]
777	vpbroadcastd	0x00(%rdi),%ymm0
778	vpaddd		0x00(%rsp),%ymm0,%ymm0
779	vmovdqa		%ymm0,0x00(%rsp)
780	vpbroadcastd	0x04(%rdi),%ymm0
781	vpaddd		0x20(%rsp),%ymm0,%ymm0
782	vmovdqa		%ymm0,0x20(%rsp)
783	vpbroadcastd	0x08(%rdi),%ymm0
784	vpaddd		0x40(%rsp),%ymm0,%ymm0
785	vmovdqa		%ymm0,0x40(%rsp)
786	vpbroadcastd	0x0c(%rdi),%ymm0
787	vpaddd		0x60(%rsp),%ymm0,%ymm0
788	vmovdqa		%ymm0,0x60(%rsp)
789	vpbroadcastd	0x10(%rdi),%ymm0
790	vpaddd		%ymm0,%ymm4,%ymm4
791	vpbroadcastd	0x14(%rdi),%ymm0
792	vpaddd		%ymm0,%ymm5,%ymm5
793	vpbroadcastd	0x18(%rdi),%ymm0
794	vpaddd		%ymm0,%ymm6,%ymm6
795	vpbroadcastd	0x1c(%rdi),%ymm0
796	vpaddd		%ymm0,%ymm7,%ymm7
797	vpbroadcastd	0x20(%rdi),%ymm0
798	vpaddd		%ymm0,%ymm8,%ymm8
799	vpbroadcastd	0x24(%rdi),%ymm0
800	vpaddd		%ymm0,%ymm9,%ymm9
801	vpbroadcastd	0x28(%rdi),%ymm0
802	vpaddd		%ymm0,%ymm10,%ymm10
803	vpbroadcastd	0x2c(%rdi),%ymm0
804	vpaddd		%ymm0,%ymm11,%ymm11
805	vpbroadcastd	0x30(%rdi),%ymm0
806	vpaddd		%ymm0,%ymm12,%ymm12
807	vpbroadcastd	0x34(%rdi),%ymm0
808	vpaddd		%ymm0,%ymm13,%ymm13
809	vpbroadcastd	0x38(%rdi),%ymm0
810	vpaddd		%ymm0,%ymm14,%ymm14
811	vpbroadcastd	0x3c(%rdi),%ymm0
812	vpaddd		%ymm0,%ymm15,%ymm15
813
	# ymm1 still holds CTRINC here; from the next block on ymm1..3 are
	# reused as scratch for the transposition.
814	# x12 += counter values 0-7
815	vpaddd		%ymm1,%ymm12,%ymm12
816
817	# interleave 32-bit words in state n, n+1
818	vmovdqa		0x00(%rsp),%ymm0
819	vmovdqa		0x20(%rsp),%ymm1
820	vpunpckldq	%ymm1,%ymm0,%ymm2
821	vpunpckhdq	%ymm1,%ymm0,%ymm1
822	vmovdqa		%ymm2,0x00(%rsp)
823	vmovdqa		%ymm1,0x20(%rsp)
824	vmovdqa		0x40(%rsp),%ymm0
825	vmovdqa		0x60(%rsp),%ymm1
826	vpunpckldq	%ymm1,%ymm0,%ymm2
827	vpunpckhdq	%ymm1,%ymm0,%ymm1
828	vmovdqa		%ymm2,0x40(%rsp)
829	vmovdqa		%ymm1,0x60(%rsp)
830	vmovdqa		%ymm4,%ymm0
831	vpunpckldq	%ymm5,%ymm0,%ymm4
832	vpunpckhdq	%ymm5,%ymm0,%ymm5
833	vmovdqa		%ymm6,%ymm0
834	vpunpckldq	%ymm7,%ymm0,%ymm6
835	vpunpckhdq	%ymm7,%ymm0,%ymm7
836	vmovdqa		%ymm8,%ymm0
837	vpunpckldq	%ymm9,%ymm0,%ymm8
838	vpunpckhdq	%ymm9,%ymm0,%ymm9
839	vmovdqa		%ymm10,%ymm0
840	vpunpckldq	%ymm11,%ymm0,%ymm10
841	vpunpckhdq	%ymm11,%ymm0,%ymm11
842	vmovdqa		%ymm12,%ymm0
843	vpunpckldq	%ymm13,%ymm0,%ymm12
844	vpunpckhdq	%ymm13,%ymm0,%ymm13
845	vmovdqa		%ymm14,%ymm0
846	vpunpckldq	%ymm15,%ymm0,%ymm14
847	vpunpckhdq	%ymm15,%ymm0,%ymm15
848
849	# interleave 64-bit words in state n, n+2
850	vmovdqa		0x00(%rsp),%ymm0
851	vmovdqa		0x40(%rsp),%ymm2
852	vpunpcklqdq	%ymm2,%ymm0,%ymm1
853	vpunpckhqdq	%ymm2,%ymm0,%ymm2
854	vmovdqa		%ymm1,0x00(%rsp)
855	vmovdqa		%ymm2,0x40(%rsp)
856	vmovdqa		0x20(%rsp),%ymm0
857	vmovdqa		0x60(%rsp),%ymm2
858	vpunpcklqdq	%ymm2,%ymm0,%ymm1
859	vpunpckhqdq	%ymm2,%ymm0,%ymm2
860	vmovdqa		%ymm1,0x20(%rsp)
861	vmovdqa		%ymm2,0x60(%rsp)
862	vmovdqa		%ymm4,%ymm0
863	vpunpcklqdq	%ymm6,%ymm0,%ymm4
864	vpunpckhqdq	%ymm6,%ymm0,%ymm6
865	vmovdqa		%ymm5,%ymm0
866	vpunpcklqdq	%ymm7,%ymm0,%ymm5
867	vpunpckhqdq	%ymm7,%ymm0,%ymm7
868	vmovdqa		%ymm8,%ymm0
869	vpunpcklqdq	%ymm10,%ymm0,%ymm8
870	vpunpckhqdq	%ymm10,%ymm0,%ymm10
871	vmovdqa		%ymm9,%ymm0
872	vpunpcklqdq	%ymm11,%ymm0,%ymm9
873	vpunpckhqdq	%ymm11,%ymm0,%ymm11
874	vmovdqa		%ymm12,%ymm0
875	vpunpcklqdq	%ymm14,%ymm0,%ymm12
876	vpunpckhqdq	%ymm14,%ymm0,%ymm14
877	vmovdqa		%ymm13,%ymm0
878	vpunpcklqdq	%ymm15,%ymm0,%ymm13
879	vpunpckhqdq	%ymm15,%ymm0,%ymm15
880
	# Output stage. vperm2i128 with selector 0x20 combines the low
	# 128-bit lanes of its two sources into one 32-byte keystream chunk
	# in ymm0, which is xored with the input and stored at offsets
	# 0x000..0x0e0. The matching selector 0x31 combines the high lanes
	# and leaves them in a register for offsets 0x100..0x1e0 further
	# down. Before each 32-byte chunk the length in rax is checked; when
	# the chunk is not fully covered, control moves to .Lxorpart8 with
	# the keystream for that chunk in ymm0.
	# NOTE(review): jl is a signed comparison on the byte length; this
	# looks fine for small non-negative lengths - confirm with callers.
881	# interleave 128-bit words in state n, n+4
882	# xor/write first four blocks
883	vmovdqa		0x00(%rsp),%ymm1
884	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
885	cmp		$0x0020,%rax
886	jl		.Lxorpart8
887	vpxor		0x0000(%rdx),%ymm0,%ymm0
888	vmovdqu		%ymm0,0x0000(%rsi)
889	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4
890
891	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
892	cmp		$0x0040,%rax
893	jl		.Lxorpart8
894	vpxor		0x0020(%rdx),%ymm0,%ymm0
895	vmovdqu		%ymm0,0x0020(%rsi)
896	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
897
898	vmovdqa		0x40(%rsp),%ymm1
899	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
900	cmp		$0x0060,%rax
901	jl		.Lxorpart8
902	vpxor		0x0040(%rdx),%ymm0,%ymm0
903	vmovdqu		%ymm0,0x0040(%rsi)
904	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6
905
906	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
907	cmp		$0x0080,%rax
908	jl		.Lxorpart8
909	vpxor		0x0060(%rdx),%ymm0,%ymm0
910	vmovdqu		%ymm0,0x0060(%rsi)
911	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
912
913	vmovdqa		0x20(%rsp),%ymm1
914	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
915	cmp		$0x00a0,%rax
916	jl		.Lxorpart8
917	vpxor		0x0080(%rdx),%ymm0,%ymm0
918	vmovdqu		%ymm0,0x0080(%rsi)
919	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5
920
921	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
922	cmp		$0x00c0,%rax
923	jl		.Lxorpart8
924	vpxor		0x00a0(%rdx),%ymm0,%ymm0
925	vmovdqu		%ymm0,0x00a0(%rsi)
926	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
927
928	vmovdqa		0x60(%rsp),%ymm1
929	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
930	cmp		$0x00e0,%rax
931	jl		.Lxorpart8
932	vpxor		0x00c0(%rdx),%ymm0,%ymm0
933	vmovdqu		%ymm0,0x00c0(%rsi)
934	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7
935
936	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
937	cmp		$0x0100,%rax
938	jl		.Lxorpart8
939	vpxor		0x00e0(%rdx),%ymm0,%ymm0
940	vmovdqu		%ymm0,0x00e0(%rsi)
941	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
942
	# The saved high-lane chunks are moved to ymm0 first so that the
	# tail path always finds the current keystream chunk in ymm0.
943	# xor remaining blocks, write to output
944	vmovdqa		%ymm4,%ymm0
945	cmp		$0x0120,%rax
946	jl		.Lxorpart8
947	vpxor		0x0100(%rdx),%ymm0,%ymm0
948	vmovdqu		%ymm0,0x0100(%rsi)
949
950	vmovdqa		%ymm12,%ymm0
951	cmp		$0x0140,%rax
952	jl		.Lxorpart8
953	vpxor		0x0120(%rdx),%ymm0,%ymm0
954	vmovdqu		%ymm0,0x0120(%rsi)
955
956	vmovdqa		%ymm6,%ymm0
957	cmp		$0x0160,%rax
958	jl		.Lxorpart8
959	vpxor		0x0140(%rdx),%ymm0,%ymm0
960	vmovdqu		%ymm0,0x0140(%rsi)
961
962	vmovdqa		%ymm14,%ymm0
963	cmp		$0x0180,%rax
964	jl		.Lxorpart8
965	vpxor		0x0160(%rdx),%ymm0,%ymm0
966	vmovdqu		%ymm0,0x0160(%rsi)
967
968	vmovdqa		%ymm5,%ymm0
969	cmp		$0x01a0,%rax
970	jl		.Lxorpart8
971	vpxor		0x0180(%rdx),%ymm0,%ymm0
972	vmovdqu		%ymm0,0x0180(%rsi)
973
974	vmovdqa		%ymm13,%ymm0
975	cmp		$0x01c0,%rax
976	jl		.Lxorpart8
977	vpxor		0x01a0(%rdx),%ymm0,%ymm0
978	vmovdqu		%ymm0,0x01a0(%rsi)
979
980	vmovdqa		%ymm7,%ymm0
981	cmp		$0x01e0,%rax
982	jl		.Lxorpart8
983	vpxor		0x01c0(%rdx),%ymm0,%ymm0
984	vmovdqu		%ymm0,0x01c0(%rsi)
985
986	vmovdqa		%ymm15,%ymm0
987	cmp		$0x0200,%rax
988	jl		.Lxorpart8
989	vpxor		0x01e0(%rdx),%ymm0,%ymm0
990	vmovdqu		%ymm0,0x01e0(%rsi)
991
	# Common exit, also reached from the tail path below. rsp is
	# restored from the value saved in r10 at function entry.
992.Ldone8:
993	vzeroupper
994	lea		-8(%r10),%rsp
995	ret
996
997.Lxorpart8:
998	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes (length mod 32); nothing is left to
	# do when the length is a multiple of 32.
999	mov		%rax,%r9
1000	and		$0x1f,%r9
1001	jz		.Ldone8
	# rax = length rounded down to 32 = offset of the partial chunk
1002	and		$~0x1f,%rax
1003
	# rsi is needed as the rep movsb source, keep the output pointer
1004	mov		%rsi,%r11
1005
	# The first 32-byte slot of the existing stack frame serves as the
	# bounce buffer; its previous contents are overwritten and nothing
	# on the way to .Ldone8 reads them again.
	# Copy the r9 trailing input bytes into the bounce buffer.
1006	lea		(%rdx,%rax),%rsi
1007	mov		%rsp,%rdi
1008	mov		%r9,%rcx
1009	rep movsb
1010
	# Xor the whole 32-byte slot with the keystream chunk; bytes past r9
	# are leftover slot contents and are never copied out.
1011	vpxor		0x00(%rsp),%ymm0,%ymm0
1012	vmovdqa		%ymm0,0x00(%rsp)
1013
	# Copy only the r9 valid result bytes to the output.
1014	mov		%rsp,%rsi
1015	lea		(%r11,%rax),%rdi
1016	mov		%r9,%rcx
1017	rep movsb
1018
1019	jmp		.Ldone8
1020
1021SYM_FUNC_END(chacha_8block_xor_avx2)
1022