xref: /openbmc/linux/arch/arm64/lib/xor-neon.c (revision 2612e3bbc0386368a850140a6c9b990cd496a5ec)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */
8cc9f8349SJackie Liu 
9cc9f8349SJackie Liu #include <linux/raid/xor.h>
10cc9f8349SJackie Liu #include <linux/module.h>
11cc9f8349SJackie Liu #include <asm/neon-intrinsics.h>
12cc9f8349SJackie Liu 
xor_arm64_neon_2(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2)13*320a93d4SArnd Bergmann static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
14297565aaSArd Biesheuvel 	const unsigned long * __restrict p2)
15cc9f8349SJackie Liu {
16cc9f8349SJackie Liu 	uint64_t *dp1 = (uint64_t *)p1;
17cc9f8349SJackie Liu 	uint64_t *dp2 = (uint64_t *)p2;
18cc9f8349SJackie Liu 
19cc9f8349SJackie Liu 	register uint64x2_t v0, v1, v2, v3;
20cc9f8349SJackie Liu 	long lines = bytes / (sizeof(uint64x2_t) * 4);
21cc9f8349SJackie Liu 
22cc9f8349SJackie Liu 	do {
23cc9f8349SJackie Liu 		/* p1 ^= p2 */
24cc9f8349SJackie Liu 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
25cc9f8349SJackie Liu 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
26cc9f8349SJackie Liu 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
27cc9f8349SJackie Liu 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
28cc9f8349SJackie Liu 
29cc9f8349SJackie Liu 		/* store */
30cc9f8349SJackie Liu 		vst1q_u64(dp1 +  0, v0);
31cc9f8349SJackie Liu 		vst1q_u64(dp1 +  2, v1);
32cc9f8349SJackie Liu 		vst1q_u64(dp1 +  4, v2);
33cc9f8349SJackie Liu 		vst1q_u64(dp1 +  6, v3);
34cc9f8349SJackie Liu 
35cc9f8349SJackie Liu 		dp1 += 8;
36cc9f8349SJackie Liu 		dp2 += 8;
37cc9f8349SJackie Liu 	} while (--lines > 0);
38cc9f8349SJackie Liu }
39cc9f8349SJackie Liu 
xor_arm64_neon_3(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3)40*320a93d4SArnd Bergmann static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
41297565aaSArd Biesheuvel 	const unsigned long * __restrict p2,
42297565aaSArd Biesheuvel 	const unsigned long * __restrict p3)
43cc9f8349SJackie Liu {
44cc9f8349SJackie Liu 	uint64_t *dp1 = (uint64_t *)p1;
45cc9f8349SJackie Liu 	uint64_t *dp2 = (uint64_t *)p2;
46cc9f8349SJackie Liu 	uint64_t *dp3 = (uint64_t *)p3;
47cc9f8349SJackie Liu 
48cc9f8349SJackie Liu 	register uint64x2_t v0, v1, v2, v3;
49cc9f8349SJackie Liu 	long lines = bytes / (sizeof(uint64x2_t) * 4);
50cc9f8349SJackie Liu 
51cc9f8349SJackie Liu 	do {
52cc9f8349SJackie Liu 		/* p1 ^= p2 */
53cc9f8349SJackie Liu 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
54cc9f8349SJackie Liu 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
55cc9f8349SJackie Liu 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
56cc9f8349SJackie Liu 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
57cc9f8349SJackie Liu 
58cc9f8349SJackie Liu 		/* p1 ^= p3 */
59cc9f8349SJackie Liu 		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
60cc9f8349SJackie Liu 		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
61cc9f8349SJackie Liu 		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
62cc9f8349SJackie Liu 		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
63cc9f8349SJackie Liu 
64cc9f8349SJackie Liu 		/* store */
65cc9f8349SJackie Liu 		vst1q_u64(dp1 +  0, v0);
66cc9f8349SJackie Liu 		vst1q_u64(dp1 +  2, v1);
67cc9f8349SJackie Liu 		vst1q_u64(dp1 +  4, v2);
68cc9f8349SJackie Liu 		vst1q_u64(dp1 +  6, v3);
69cc9f8349SJackie Liu 
70cc9f8349SJackie Liu 		dp1 += 8;
71cc9f8349SJackie Liu 		dp2 += 8;
72cc9f8349SJackie Liu 		dp3 += 8;
73cc9f8349SJackie Liu 	} while (--lines > 0);
74cc9f8349SJackie Liu }
75cc9f8349SJackie Liu 
xor_arm64_neon_4(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4)76*320a93d4SArnd Bergmann static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
77297565aaSArd Biesheuvel 	const unsigned long * __restrict p2,
78297565aaSArd Biesheuvel 	const unsigned long * __restrict p3,
79297565aaSArd Biesheuvel 	const unsigned long * __restrict p4)
80cc9f8349SJackie Liu {
81cc9f8349SJackie Liu 	uint64_t *dp1 = (uint64_t *)p1;
82cc9f8349SJackie Liu 	uint64_t *dp2 = (uint64_t *)p2;
83cc9f8349SJackie Liu 	uint64_t *dp3 = (uint64_t *)p3;
84cc9f8349SJackie Liu 	uint64_t *dp4 = (uint64_t *)p4;
85cc9f8349SJackie Liu 
86cc9f8349SJackie Liu 	register uint64x2_t v0, v1, v2, v3;
87cc9f8349SJackie Liu 	long lines = bytes / (sizeof(uint64x2_t) * 4);
88cc9f8349SJackie Liu 
89cc9f8349SJackie Liu 	do {
90cc9f8349SJackie Liu 		/* p1 ^= p2 */
91cc9f8349SJackie Liu 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
92cc9f8349SJackie Liu 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
93cc9f8349SJackie Liu 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
94cc9f8349SJackie Liu 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
95cc9f8349SJackie Liu 
96cc9f8349SJackie Liu 		/* p1 ^= p3 */
97cc9f8349SJackie Liu 		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
98cc9f8349SJackie Liu 		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
99cc9f8349SJackie Liu 		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
100cc9f8349SJackie Liu 		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
101cc9f8349SJackie Liu 
102cc9f8349SJackie Liu 		/* p1 ^= p4 */
103cc9f8349SJackie Liu 		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
104cc9f8349SJackie Liu 		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
105cc9f8349SJackie Liu 		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
106cc9f8349SJackie Liu 		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
107cc9f8349SJackie Liu 
108cc9f8349SJackie Liu 		/* store */
109cc9f8349SJackie Liu 		vst1q_u64(dp1 +  0, v0);
110cc9f8349SJackie Liu 		vst1q_u64(dp1 +  2, v1);
111cc9f8349SJackie Liu 		vst1q_u64(dp1 +  4, v2);
112cc9f8349SJackie Liu 		vst1q_u64(dp1 +  6, v3);
113cc9f8349SJackie Liu 
114cc9f8349SJackie Liu 		dp1 += 8;
115cc9f8349SJackie Liu 		dp2 += 8;
116cc9f8349SJackie Liu 		dp3 += 8;
117cc9f8349SJackie Liu 		dp4 += 8;
118cc9f8349SJackie Liu 	} while (--lines > 0);
119cc9f8349SJackie Liu }
120cc9f8349SJackie Liu 
xor_arm64_neon_5(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4,const unsigned long * __restrict p5)121*320a93d4SArnd Bergmann static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
122297565aaSArd Biesheuvel 	const unsigned long * __restrict p2,
123297565aaSArd Biesheuvel 	const unsigned long * __restrict p3,
124297565aaSArd Biesheuvel 	const unsigned long * __restrict p4,
125297565aaSArd Biesheuvel 	const unsigned long * __restrict p5)
126cc9f8349SJackie Liu {
127cc9f8349SJackie Liu 	uint64_t *dp1 = (uint64_t *)p1;
128cc9f8349SJackie Liu 	uint64_t *dp2 = (uint64_t *)p2;
129cc9f8349SJackie Liu 	uint64_t *dp3 = (uint64_t *)p3;
130cc9f8349SJackie Liu 	uint64_t *dp4 = (uint64_t *)p4;
131cc9f8349SJackie Liu 	uint64_t *dp5 = (uint64_t *)p5;
132cc9f8349SJackie Liu 
133cc9f8349SJackie Liu 	register uint64x2_t v0, v1, v2, v3;
134cc9f8349SJackie Liu 	long lines = bytes / (sizeof(uint64x2_t) * 4);
135cc9f8349SJackie Liu 
136cc9f8349SJackie Liu 	do {
137cc9f8349SJackie Liu 		/* p1 ^= p2 */
138cc9f8349SJackie Liu 		v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
139cc9f8349SJackie Liu 		v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
140cc9f8349SJackie Liu 		v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
141cc9f8349SJackie Liu 		v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));
142cc9f8349SJackie Liu 
143cc9f8349SJackie Liu 		/* p1 ^= p3 */
144cc9f8349SJackie Liu 		v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
145cc9f8349SJackie Liu 		v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
146cc9f8349SJackie Liu 		v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
147cc9f8349SJackie Liu 		v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));
148cc9f8349SJackie Liu 
149cc9f8349SJackie Liu 		/* p1 ^= p4 */
150cc9f8349SJackie Liu 		v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
151cc9f8349SJackie Liu 		v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
152cc9f8349SJackie Liu 		v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
153cc9f8349SJackie Liu 		v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));
154cc9f8349SJackie Liu 
155cc9f8349SJackie Liu 		/* p1 ^= p5 */
156cc9f8349SJackie Liu 		v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
157cc9f8349SJackie Liu 		v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
158cc9f8349SJackie Liu 		v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
159cc9f8349SJackie Liu 		v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));
160cc9f8349SJackie Liu 
161cc9f8349SJackie Liu 		/* store */
162cc9f8349SJackie Liu 		vst1q_u64(dp1 +  0, v0);
163cc9f8349SJackie Liu 		vst1q_u64(dp1 +  2, v1);
164cc9f8349SJackie Liu 		vst1q_u64(dp1 +  4, v2);
165cc9f8349SJackie Liu 		vst1q_u64(dp1 +  6, v3);
166cc9f8349SJackie Liu 
167cc9f8349SJackie Liu 		dp1 += 8;
168cc9f8349SJackie Liu 		dp2 += 8;
169cc9f8349SJackie Liu 		dp3 += 8;
170cc9f8349SJackie Liu 		dp4 += 8;
171cc9f8349SJackie Liu 		dp5 += 8;
172cc9f8349SJackie Liu 	} while (--lines > 0);
173cc9f8349SJackie Liu }
174cc9f8349SJackie Liu 
1752c54b423SArd Biesheuvel struct xor_block_template xor_block_inner_neon __ro_after_init = {
176cc9f8349SJackie Liu 	.name	= "__inner_neon__",
177cc9f8349SJackie Liu 	.do_2	= xor_arm64_neon_2,
178cc9f8349SJackie Liu 	.do_3	= xor_arm64_neon_3,
179cc9f8349SJackie Liu 	.do_4	= xor_arm64_neon_4,
180cc9f8349SJackie Liu 	.do_5	= xor_arm64_neon_5,
181cc9f8349SJackie Liu };
182cc9f8349SJackie Liu EXPORT_SYMBOL(xor_block_inner_neon);
183cc9f8349SJackie Liu 
/*
 * eor3 - three-way XOR of 128-bit vectors with a single EOR3 instruction.
 * @p: first operand
 * @q: second operand
 * @r: third operand
 *
 * Returns the result of the EOR3 instruction applied to all 16 bytes of the
 * operands (used in this file as p ^ q ^ r).
 *
 * The instruction is emitted through inline asm with ".arch_extension sha3"
 * so that the assembler accepts it. In this file the EOR3-based routines are
 * only installed by xor_neon_init() after the SHA3 CPU feature check.
 */
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	/* "w" constraints place the operands in SIMD/FP registers. */
	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}
1932c54b423SArd Biesheuvel 
xor_arm64_eor3_3(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3)194297565aaSArd Biesheuvel static void xor_arm64_eor3_3(unsigned long bytes,
195297565aaSArd Biesheuvel 	unsigned long * __restrict p1,
196297565aaSArd Biesheuvel 	const unsigned long * __restrict p2,
197297565aaSArd Biesheuvel 	const unsigned long * __restrict p3)
1982c54b423SArd Biesheuvel {
1992c54b423SArd Biesheuvel 	uint64_t *dp1 = (uint64_t *)p1;
2002c54b423SArd Biesheuvel 	uint64_t *dp2 = (uint64_t *)p2;
2012c54b423SArd Biesheuvel 	uint64_t *dp3 = (uint64_t *)p3;
2022c54b423SArd Biesheuvel 
2032c54b423SArd Biesheuvel 	register uint64x2_t v0, v1, v2, v3;
2042c54b423SArd Biesheuvel 	long lines = bytes / (sizeof(uint64x2_t) * 4);
2052c54b423SArd Biesheuvel 
2062c54b423SArd Biesheuvel 	do {
2072c54b423SArd Biesheuvel 		/* p1 ^= p2 ^ p3 */
2082c54b423SArd Biesheuvel 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
2092c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 0));
2102c54b423SArd Biesheuvel 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
2112c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 2));
2122c54b423SArd Biesheuvel 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
2132c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 4));
2142c54b423SArd Biesheuvel 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
2152c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 6));
2162c54b423SArd Biesheuvel 
2172c54b423SArd Biesheuvel 		/* store */
2182c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 0, v0);
2192c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 2, v1);
2202c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 4, v2);
2212c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 6, v3);
2222c54b423SArd Biesheuvel 
2232c54b423SArd Biesheuvel 		dp1 += 8;
2242c54b423SArd Biesheuvel 		dp2 += 8;
2252c54b423SArd Biesheuvel 		dp3 += 8;
2262c54b423SArd Biesheuvel 	} while (--lines > 0);
2272c54b423SArd Biesheuvel }
2282c54b423SArd Biesheuvel 
xor_arm64_eor3_4(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4)229297565aaSArd Biesheuvel static void xor_arm64_eor3_4(unsigned long bytes,
230297565aaSArd Biesheuvel 	unsigned long * __restrict p1,
231297565aaSArd Biesheuvel 	const unsigned long * __restrict p2,
232297565aaSArd Biesheuvel 	const unsigned long * __restrict p3,
233297565aaSArd Biesheuvel 	const unsigned long * __restrict p4)
2342c54b423SArd Biesheuvel {
2352c54b423SArd Biesheuvel 	uint64_t *dp1 = (uint64_t *)p1;
2362c54b423SArd Biesheuvel 	uint64_t *dp2 = (uint64_t *)p2;
2372c54b423SArd Biesheuvel 	uint64_t *dp3 = (uint64_t *)p3;
2382c54b423SArd Biesheuvel 	uint64_t *dp4 = (uint64_t *)p4;
2392c54b423SArd Biesheuvel 
2402c54b423SArd Biesheuvel 	register uint64x2_t v0, v1, v2, v3;
2412c54b423SArd Biesheuvel 	long lines = bytes / (sizeof(uint64x2_t) * 4);
2422c54b423SArd Biesheuvel 
2432c54b423SArd Biesheuvel 	do {
2442c54b423SArd Biesheuvel 		/* p1 ^= p2 ^ p3 */
2452c54b423SArd Biesheuvel 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
2462c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 0));
2472c54b423SArd Biesheuvel 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
2482c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 2));
2492c54b423SArd Biesheuvel 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
2502c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 4));
2512c54b423SArd Biesheuvel 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
2522c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 6));
2532c54b423SArd Biesheuvel 
2542c54b423SArd Biesheuvel 		/* p1 ^= p4 */
2552c54b423SArd Biesheuvel 		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
2562c54b423SArd Biesheuvel 		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
2572c54b423SArd Biesheuvel 		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
2582c54b423SArd Biesheuvel 		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
2592c54b423SArd Biesheuvel 
2602c54b423SArd Biesheuvel 		/* store */
2612c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 0, v0);
2622c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 2, v1);
2632c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 4, v2);
2642c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 6, v3);
2652c54b423SArd Biesheuvel 
2662c54b423SArd Biesheuvel 		dp1 += 8;
2672c54b423SArd Biesheuvel 		dp2 += 8;
2682c54b423SArd Biesheuvel 		dp3 += 8;
2692c54b423SArd Biesheuvel 		dp4 += 8;
2702c54b423SArd Biesheuvel 	} while (--lines > 0);
2712c54b423SArd Biesheuvel }
2722c54b423SArd Biesheuvel 
xor_arm64_eor3_5(unsigned long bytes,unsigned long * __restrict p1,const unsigned long * __restrict p2,const unsigned long * __restrict p3,const unsigned long * __restrict p4,const unsigned long * __restrict p5)273297565aaSArd Biesheuvel static void xor_arm64_eor3_5(unsigned long bytes,
274297565aaSArd Biesheuvel 	unsigned long * __restrict p1,
275297565aaSArd Biesheuvel 	const unsigned long * __restrict p2,
276297565aaSArd Biesheuvel 	const unsigned long * __restrict p3,
277297565aaSArd Biesheuvel 	const unsigned long * __restrict p4,
278297565aaSArd Biesheuvel 	const unsigned long * __restrict p5)
2792c54b423SArd Biesheuvel {
2802c54b423SArd Biesheuvel 	uint64_t *dp1 = (uint64_t *)p1;
2812c54b423SArd Biesheuvel 	uint64_t *dp2 = (uint64_t *)p2;
2822c54b423SArd Biesheuvel 	uint64_t *dp3 = (uint64_t *)p3;
2832c54b423SArd Biesheuvel 	uint64_t *dp4 = (uint64_t *)p4;
2842c54b423SArd Biesheuvel 	uint64_t *dp5 = (uint64_t *)p5;
2852c54b423SArd Biesheuvel 
2862c54b423SArd Biesheuvel 	register uint64x2_t v0, v1, v2, v3;
2872c54b423SArd Biesheuvel 	long lines = bytes / (sizeof(uint64x2_t) * 4);
2882c54b423SArd Biesheuvel 
2892c54b423SArd Biesheuvel 	do {
2902c54b423SArd Biesheuvel 		/* p1 ^= p2 ^ p3 */
2912c54b423SArd Biesheuvel 		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
2922c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 0));
2932c54b423SArd Biesheuvel 		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
2942c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 2));
2952c54b423SArd Biesheuvel 		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
2962c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 4));
2972c54b423SArd Biesheuvel 		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
2982c54b423SArd Biesheuvel 			  vld1q_u64(dp3 + 6));
2992c54b423SArd Biesheuvel 
3002c54b423SArd Biesheuvel 		/* p1 ^= p4 ^ p5 */
3012c54b423SArd Biesheuvel 		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
3022c54b423SArd Biesheuvel 		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
3032c54b423SArd Biesheuvel 		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
3042c54b423SArd Biesheuvel 		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
3052c54b423SArd Biesheuvel 
3062c54b423SArd Biesheuvel 		/* store */
3072c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 0, v0);
3082c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 2, v1);
3092c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 4, v2);
3102c54b423SArd Biesheuvel 		vst1q_u64(dp1 + 6, v3);
3112c54b423SArd Biesheuvel 
3122c54b423SArd Biesheuvel 		dp1 += 8;
3132c54b423SArd Biesheuvel 		dp2 += 8;
3142c54b423SArd Biesheuvel 		dp3 += 8;
3152c54b423SArd Biesheuvel 		dp4 += 8;
3162c54b423SArd Biesheuvel 		dp5 += 8;
3172c54b423SArd Biesheuvel 	} while (--lines > 0);
3182c54b423SArd Biesheuvel }
3192c54b423SArd Biesheuvel 
xor_neon_init(void)3202c54b423SArd Biesheuvel static int __init xor_neon_init(void)
3212c54b423SArd Biesheuvel {
3222c54b423SArd Biesheuvel 	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
3232c54b423SArd Biesheuvel 		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
3242c54b423SArd Biesheuvel 		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
3252c54b423SArd Biesheuvel 		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
3262c54b423SArd Biesheuvel 	}
3272c54b423SArd Biesheuvel 	return 0;
3282c54b423SArd Biesheuvel }
3292c54b423SArd Biesheuvel module_init(xor_neon_init);
3302c54b423SArd Biesheuvel 
xor_neon_exit(void)3312c54b423SArd Biesheuvel static void __exit xor_neon_exit(void)
3322c54b423SArd Biesheuvel {
3332c54b423SArd Biesheuvel }
3342c54b423SArd Biesheuvel module_exit(xor_neon_exit);
3352c54b423SArd Biesheuvel 
336cc9f8349SJackie Liu MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
337cc9f8349SJackie Liu MODULE_DESCRIPTION("ARMv8 XOR Extensions");
338cc9f8349SJackie Liu MODULE_LICENSE("GPL");
339