// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3,
			     unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3,
			     unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
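
/*
 * Illustrative userspace sketch (not part of the kernel build, hence kept
 * under "#if 0"): it repeats the veorq_u64 pattern from xor_arm64_neon_2()
 * above and checks the result against a plain scalar XOR.  The buffer size,
 * names and build command below are assumptions for the demo only; the
 * kernel callers always pass a byte count that is a multiple of 64, which
 * is what the 4 x 128-bit unrolled loop relies on.
 *
 * Build on an AArch64 host with, e.g.:  gcc -O2 -o xor_demo xor_demo.c
 */
#if 0
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same structure as xor_arm64_neon_2(): 64 bytes (4 x uint64x2_t) per pass */
static void demo_xor_neon_2(unsigned long bytes, uint64_t *dp1,
			    const uint64_t *dp2)
{
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		uint64x2_t v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		uint64x2_t v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		uint64x2_t v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		uint64x2_t v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

int main(void)
{
	enum { N = 64 };	/* 64 u64s = 512 bytes, a multiple of 64 bytes */
	uint64_t a[N], b[N], ref[N];
	int i;

	for (i = 0; i < N; i++) {
		a[i] = 0x0123456789abcdefULL * (i + 1);
		b[i] = ~a[i] ^ (uint64_t)i;
		ref[i] = a[i] ^ b[i];	/* scalar reference for a ^= b */
	}

	demo_xor_neon_2(sizeof(a), a, b);

	printf("NEON XOR %s the scalar reference\n",
	       memcmp(a, ref, sizeof(a)) == 0 ? "matches" : "DIFFERS from");
	return 0;
}
#endif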