xref: /openbmc/linux/lib/raid6/recov_neon_inner.c (revision 2e7c04aec86758e0adfcad4a24c86593b45807a3)
1 /*
2  * Copyright (C) 2012 Intel Corporation
3  * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License
7  * as published by the Free Software Foundation; version 2
8  * of the License.
9  */
10 
11 #include <arm_neon.h>
12 
/*
 * Mask that keeps the low nibble (bits 3:0) of each of the 16 byte lanes.
 *
 * It is produced with vdupq_n_u8() instead of a brace initializer list:
 * vector initializer lists are not portable when mixed with NEON
 * intrinsics on big-endian targets (clang warns about this with
 * -Wnonportable-vector-initialization). A vector built by an intrinsic is
 * not a constant expression, so it cannot initialize a file-scope object;
 * hence the macro. The compiler is free to hoist the splat out of loops.
 */
#define x0f	vdupq_n_u8(0x0f)
17 
#ifdef CONFIG_ARM
/*
 * The 128-bit table lookup intrinsic vqtbl1q_u8() is not offered on
 * AArch32, because the instruction behind it does not exist there; the
 * only lookup available is the 64-bit wide vtbl.8. Emulate the missing
 * intrinsic with it: view the 16-byte table @a as a pair of 8-byte
 * halves, look up the low and the high half of the index vector @b
 * separately, and glue the two 8-byte results back together.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	/* Reinterpret the 128-bit table as the two-register table vtbl2 wants. */
	union {
		uint8x16_t	whole;
		uint8x8x2_t	halves;
	} table = { a };
	uint8x8_t lo = vtbl2_u8(table.halves, vget_low_u8(b));
	uint8x8_t hi = vtbl2_u8(table.halves, vget_high_u8(b));

	return vcombine_u8(lo, hi);
}
#endif
35 
/*
 * NEON inner loop of the RAID-6 two-data-block recovery.
 *
 * @bytes:	number of bytes to process in every buffer
 * @p, @q:	input buffers; only read by this function
 * @dp, @dq:	read as input and overwritten with the two results
 * @pbmul:	32-byte lookup table, split by nibble (see below)
 * @qmul:	32-byte lookup table, split by nibble (see below)
 *
 * Both tables are stored as two 16-entry halves: entries 0..15 are
 * indexed by the low nibble of a byte, entries 16..31 by its high nibble,
 * and the XOR of the two lookups is the table's value for that byte:
 *
 *	LOOKUP(t, x) = t[x & 0x0f] ^ t[16 + (x >> 4)]
 *
 * With that, the operation performed on every byte position is:
 *
 *	px    = *p ^ *dp;
 *	qx    = LOOKUP(qmul, *q ^ *dq);
 *	*dq++ = db = LOOKUP(pbmul, px) ^ qx;
 *	*dp++ = db ^ px;
 *	p++; q++;
 *
 * Whole 16-byte blocks are handled with NEON; a remainder of fewer than
 * 16 bytes is finished with the scalar form above, so @bytes need not be
 * a multiple of 16. A @bytes value <= 0 processes nothing.
 */
void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
			      uint8_t *dq, const uint8_t *pbmul,
			      const uint8_t *qmul)
{
	/* Low-nibble mask; the splat intrinsic is endian-neutral. */
	const uint8x16_t nibble = vdupq_n_u8(0x0f);
	uint8x16_t pm0 = vld1q_u8(pbmul);
	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
	uint8x16_t qm0 = vld1q_u8(qmul);
	uint8x16_t qm1 = vld1q_u8(qmul + 16);

	/* Only enter while a complete 16-byte block is still available. */
	while (bytes >= 16) {
		uint8x16_t vx, vy, px, qx, db;

		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		/*
		 * A per-byte logical shift leaves only the high nibble, so
		 * the shifted value can index the table without masking.
		 */
		vy = vshrq_n_u8(vx, 4);
		vx = vqtbl1q_u8(qm0, vandq_u8(vx, nibble));
		vy = vqtbl1q_u8(qm1, vy);
		qx = veorq_u8(vx, vy);

		vy = vshrq_n_u8(px, 4);
		vx = vqtbl1q_u8(pm0, vandq_u8(px, nibble));
		vy = vqtbl1q_u8(pm1, vy);
		vx = veorq_u8(vx, vy);
		db = veorq_u8(vx, qx);

		vst1q_u8(dq, db);
		vst1q_u8(dp, veorq_u8(db, px));

		bytes -= 16;
		p += 16;
		q += 16;
		dp += 16;
		dq += 16;
	}

	/* Scalar tail: same computation, one byte at a time. */
	while (bytes > 0) {
		uint8_t px = *p ^ *dp;
		uint8_t qv = *q ^ *dq;
		uint8_t qx = qmul[qv & 0x0f] ^ qmul[16 + (qv >> 4)];
		uint8_t db = pbmul[px & 0x0f] ^ pbmul[16 + (px >> 4)] ^ qx;

		*dq++ = db;
		*dp++ = db ^ px;
		p++;
		q++;
		bytes--;
	}
}
84 
/*
 * NEON inner loop of the RAID-6 data + P recovery.
 *
 * @bytes:	number of bytes to process in every buffer
 * @p:		read and updated in place (XORed with the new @dq bytes)
 * @q:		input buffer; only read by this function
 * @dq:		read as input and overwritten with the result
 * @qmul:	32-byte lookup table, split by nibble (see below)
 *
 * The table is stored as two 16-entry halves: entries 0..15 are indexed
 * by the low nibble of a byte, entries 16..31 by its high nibble, and the
 * XOR of the two lookups is the table's value for that byte:
 *
 *	LOOKUP(t, x) = t[x & 0x0f] ^ t[16 + (x >> 4)]
 *
 * With that, the operation performed on every byte position is:
 *
 *	*p++ ^= *dq = LOOKUP(qmul, *q ^ *dq);
 *	q++; dq++;
 *
 * Whole 16-byte blocks are handled with NEON; a remainder of fewer than
 * 16 bytes is finished with the scalar form above, so @bytes need not be
 * a multiple of 16. A @bytes value <= 0 processes nothing.
 */
void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
			      const uint8_t *qmul)
{
	/* Low-nibble mask; the splat intrinsic is endian-neutral. */
	const uint8x16_t nibble = vdupq_n_u8(0x0f);
	uint8x16_t qm0 = vld1q_u8(qmul);
	uint8x16_t qm1 = vld1q_u8(qmul + 16);

	/* Only enter while a complete 16-byte block is still available. */
	while (bytes >= 16) {
		uint8x16_t vx, vy;

		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		/*
		 * A per-byte logical shift leaves only the high nibble, so
		 * the shifted value can index the table without masking.
		 */
		vy = vshrq_n_u8(vx, 4);
		vx = vqtbl1q_u8(qm0, vandq_u8(vx, nibble));
		vy = vqtbl1q_u8(qm1, vy);
		vx = veorq_u8(vx, vy);
		vy = veorq_u8(vx, vld1q_u8(p));

		vst1q_u8(dq, vx);
		vst1q_u8(p, vy);

		bytes -= 16;
		p += 16;
		q += 16;
		dq += 16;
	}

	/* Scalar tail: same computation, one byte at a time. */
	while (bytes > 0) {
		uint8_t qv = *q ^ *dq;
		uint8_t db = qmul[qv & 0x0f] ^ qmul[16 + (qv >> 4)];

		*dq++ = db;
		*p++ ^= db;
		q++;
		bytes--;
	}
}
118