// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */
8cc9f8349SJackie Liu
9cc9f8349SJackie Liu #include <linux/raid/xor.h>
10cc9f8349SJackie Liu #include <linux/module.h>
11cc9f8349SJackie Liu #include <asm/neon-intrinsics.h>
12cc9f8349SJackie Liu
/*
 * xor_arm64_neon_2 - XOR one source buffer into the destination with NEON.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2)
 * @p2:    source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 * NOTE(review): the caller presumably has to make NEON usable before
 * calling this - confirm against the call site.
 */
static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
	}
}
39cc9f8349SJackie Liu
/*
 * xor_arm64_neon_3 - XOR two source buffers into the destination with NEON.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2 ^ p3)
 * @p2:    first source buffer, only read
 * @p3:    second source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 * NOTE(review): the caller presumably has to make NEON usable before
 * calling this - confirm against the call site.
 */
static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	const uint64_t *dp3 = (const uint64_t *)p3;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	}
}
75cc9f8349SJackie Liu
/*
 * xor_arm64_neon_4 - XOR three source buffers into the destination with NEON.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2 ^ p3 ^ p4)
 * @p2:    first source buffer, only read
 * @p3:    second source buffer, only read
 * @p4:    third source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 * NOTE(review): the caller presumably has to make NEON usable before
 * calling this - confirm against the call site.
 */
static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	const uint64_t *dp3 = (const uint64_t *)p3;
	const uint64_t *dp4 = (const uint64_t *)p4;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	}
}
120cc9f8349SJackie Liu
/*
 * xor_arm64_neon_5 - XOR four source buffers into the destination with NEON.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2 ^ p3 ^ p4 ^ p5)
 * @p2:    first source buffer, only read
 * @p3:    second source buffer, only read
 * @p4:    third source buffer, only read
 * @p5:    fourth source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 * NOTE(review): the caller presumably has to make NEON usable before
 * calling this - confirm against the call site.
 */
static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	const uint64_t *dp3 = (const uint64_t *)p3;
	const uint64_t *dp4 = (const uint64_t *)p4;
	const uint64_t *dp5 = (const uint64_t *)p5;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	}
}
174cc9f8349SJackie Liu
1752c54b423SArd Biesheuvel struct xor_block_template xor_block_inner_neon __ro_after_init = {
176cc9f8349SJackie Liu .name = "__inner_neon__",
177cc9f8349SJackie Liu .do_2 = xor_arm64_neon_2,
178cc9f8349SJackie Liu .do_3 = xor_arm64_neon_3,
179cc9f8349SJackie Liu .do_4 = xor_arm64_neon_4,
180cc9f8349SJackie Liu .do_5 = xor_arm64_neon_5,
181cc9f8349SJackie Liu };
182cc9f8349SJackie Liu EXPORT_SYMBOL(xor_block_inner_neon);
183cc9f8349SJackie Liu
/*
 * eor3 - three-way XOR of 128-bit vectors in a single instruction.
 *
 * Emits the EOR3 instruction (enabled through ".arch_extension sha3") and
 * returns p ^ q ^ r. Must only be reached on CPUs that implement it; in
 * this file the users are installed by xor_neon_init() after a feature
 * check.
 */
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t out;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(out)
	    : "w"(p), "w"(q), "w"(r));

	return out;
}
1932c54b423SArd Biesheuvel
/*
 * xor_arm64_eor3_3 - XOR two source buffers into the destination using the
 * three-way EOR3 instruction.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2 ^ p3)
 * @p2:    first source buffer, only read
 * @p3:    second source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 */
static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	const uint64_t *dp3 = (const uint64_t *)p3;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	}
}
2282c54b423SArd Biesheuvel
/*
 * xor_arm64_eor3_4 - XOR three source buffers into the destination, using
 * one three-way EOR3 followed by a regular two-way XOR per vector.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2 ^ p3 ^ p4)
 * @p2:    first source buffer, only read
 * @p3:    second source buffer, only read
 * @p4:    third source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 */
static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	const uint64_t *dp3 = (const uint64_t *)p3;
	const uint64_t *dp4 = (const uint64_t *)p4;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	}
}
2722c54b423SArd Biesheuvel
/*
 * xor_arm64_eor3_5 - XOR four source buffers into the destination, using
 * two three-way EOR3 operations per vector.
 * @bytes: number of bytes to process. Work is done in 64-byte lines (four
 *         128-bit vectors); a trailing remainder shorter than one line is
 *         left untouched, and a length below one line touches no memory.
 * @p1:    destination buffer, updated in place (p1 ^= p2 ^ p3 ^ p4 ^ p5)
 * @p2:    first source buffer, only read
 * @p3:    second source buffer, only read
 * @p4:    third source buffer, only read
 * @p5:    fourth source buffer, only read
 *
 * The buffers are __restrict qualified, so they must not overlap.
 */
static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	const uint64_t *dp2 = (const uint64_t *)p2;
	const uint64_t *dp3 = (const uint64_t *)p3;
	const uint64_t *dp4 = (const uint64_t *)p4;
	const uint64_t *dp5 = (const uint64_t *)p5;
	uint64x2_t v0, v1, v2, v3;
	unsigned long lines = bytes / (sizeof(uint64x2_t) * 4);

	/* Pre-tested loop: never run a pass when there is no full line. */
	for (; lines > 0; lines--) {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by one line: 8 x 64-bit words */
		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	}
}
3192c54b423SArd Biesheuvel
xor_neon_init(void)3202c54b423SArd Biesheuvel static int __init xor_neon_init(void)
3212c54b423SArd Biesheuvel {
3222c54b423SArd Biesheuvel if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
3232c54b423SArd Biesheuvel xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
3242c54b423SArd Biesheuvel xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
3252c54b423SArd Biesheuvel xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
3262c54b423SArd Biesheuvel }
3272c54b423SArd Biesheuvel return 0;
3282c54b423SArd Biesheuvel }
3292c54b423SArd Biesheuvel module_init(xor_neon_init);
3302c54b423SArd Biesheuvel
xor_neon_exit(void)3312c54b423SArd Biesheuvel static void __exit xor_neon_exit(void)
3322c54b423SArd Biesheuvel {
3332c54b423SArd Biesheuvel }
3342c54b423SArd Biesheuvel module_exit(xor_neon_exit);
3352c54b423SArd Biesheuvel
336cc9f8349SJackie Liu MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
337cc9f8349SJackie Liu MODULE_DESCRIPTION("ARMv8 XOR Extensions");
338cc9f8349SJackie Liu MODULE_LICENSE("GPL");
339