/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000
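
# ROT8 and ROT16 are pshufb control masks: result byte i is taken from source
# byte mask[i], so each 32-bit lane is rotated left by a whole number of bytes
# in a single shuffle.  CTRINC supplies the per-lane block counter increments
# 0..3 used by chacha_4block_xor_ssse3.
#
# For instance, the ROT16 mask implements this per-lane operation (plain C for
# illustration only, assuming little-endian 32-bit lanes):
#
#	static inline u32 rotl16(u32 x)
#	{
#		return (x << 16) | (x >> 16);	/* bytes b3 b2 b1 b0 -> b1 b0 b3 b2 */
#	}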

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round.  The 8/16-bit word
 * rotations are done with the slightly better-performing SSSE3 byte shuffle;
 * the 7/12-bit word rotations use the traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
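# For reference, each column/diagonal step below is the standard ChaCha
# quarter round.  A minimal scalar C sketch (illustration only, not used by
# the build), annotated with how each rotation is realized here:
#
#	#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))
#
#	void quarterround(u32 *a, u32 *b, u32 *c, u32 *d)
#	{
#		*a += *b; *d = ROTL32(*d ^ *a, 16);	/* pshufb with ROT16 */
#		*c += *d; *b = ROTL32(*b ^ *c, 12);	/* pslld/psrld + por */
#		*a += *b; *d = ROTL32(*d ^ *a,  8);	/* pshufb with ROT8  */
#		*c += *d; *b = ROTL32(*b ^ *c,  7);	/* pslld/psrld + por */
#	}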
chacha_permute:

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
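	#
	# Judging from the register assignments above, the matching C prototype
	# is presumably along these lines (a sketch, not taken from a header):
	#
	#	void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
	#				    unsigned int len, int nrounds);
	#
	# Requests shorter than a full 64-byte block fall through to .Lxorpart.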
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	mov		%rcx,%rax
	call		chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	ret

.Lxorpart:
	# xor remaining bytes from partial register into output
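	#
	# In C terms, the tail handling below is roughly (illustrative sketch
	# only; "buf" stands for the 16-byte aligned scratch area on the stack):
	#
	#	partial = len & 0x0f;		/* bytes past the last full chunk */
	#	offset  = len & ~0x0f;		/* start of the partial chunk     */
	#	memcpy(buf, src + offset, partial);
	#	buf[0..15] ^= keystream;	/* keystream chunk is in %xmm0    */
	#	memcpy(dst + offset, buf, partial);
	#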
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone

ENDPROC(chacha_block_xor_ssse3)

ENTRY(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
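	#
	# HChaCha runs the permutation only, with no feed-forward addition of
	# the input state, and returns just the first and last rows of the
	# permuted matrix (x0..x3 and x12..x15), as stored below.  A rough C
	# prototype, with types assumed from the comments above:
	#
	#	void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);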
	FRAME_BEGIN

	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3

	mov		%edx,%r8d
	call		chacha_permute

	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	ret
ENDPROC(hchacha_block_ssse3)

ENTRY(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
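	#
	# Same calling convention as chacha_block_xor_ssse3, but for up to four
	# 64-byte blocks (256 bytes) per call.  Lane n of the replicated state
	# uses the input block counter plus n, via the CTRINC constant.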

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix four times into SSE registers, one 32-bit state word per
	# register with one lane per block. As we need some scratch registers,
	# we save the first four registers on the stack. The algorithm performs
	# each operation on the corresponding word of each state matrix, hence
	# requires no word shuffling. For the final XOR step we transpose the
	# matrix by interleaving 32-bit and then 64-bit words, which allows us
	# to do the XOR in SSE registers. 8/16-bit word rotations are done with
	# the slightly better-performing SSSE3 byte shuffle; 7/12-bit word
	# rotations use the traditional shift+OR.
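	#
	# The 32-bit/64-bit interleaving mentioned above is a 4x4 transpose of
	# 32-bit words across four SSE registers.  An illustrative sketch with
	# SSE2 intrinsics (symbolic names, not part of the build):
	#
	#	t0 = _mm_unpacklo_epi32(a, b);		/* a0 b0 a1 b1 */
	#	t1 = _mm_unpackhi_epi32(a, b);		/* a2 b2 a3 b3 */
	#	t2 = _mm_unpacklo_epi32(c, d);		/* c0 d0 c1 d1 */
	#	t3 = _mm_unpackhi_epi32(c, d);		/* c2 d2 c3 d3 */
	#	r0 = _mm_unpacklo_epi64(t0, t2);	/* a0 b0 c0 d0 */
	#	r1 = _mm_unpackhi_epi64(t0, t2);	/* a1 b1 c1 d1 */
	#	r2 = _mm_unpacklo_epi64(t1, t3);	/* a2 b2 c2 d2 */
	#	r3 = _mm_unpackhi_epi64(t1, t3);	/* a3 b3 c3 d3 */
	#
	# Afterwards each register (or stack slot) holds 16 contiguous bytes of
	# one output block instead of one word from each of the four blocks.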

	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
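	# Each 16-byte chunk below is one quarter of one output block.  A chunk
	# is XORed and stored only if the length in %rax covers it; otherwise
	# control drops into .Lxorpart4 with the corresponding keystream chunk
	# already in %xmm0.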
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jl		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jl		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jl		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jl		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jl		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jl		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	lea		-8(%r10),%rsp
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
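	# Same scheme as .Lxorpart above, except that the aligned scratch area
	# set up at function entry is reused as the bounce buffer, so no extra
	# stack adjustment is needed here.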
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

ENDPROC(chacha_4block_xor_ssse3)