// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
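
/*
 * Illustrative usage sketch, not part of the original file: because these
 * helpers touch NEON registers, a caller in kernel context is assumed to
 * bracket them with kernel_neon_begin()/kernel_neon_end() before the
 * xor_block_inner_neon template is handed to the generic XOR code. The
 * wrapper name below is hypothetical and only shows the expected calling
 * pattern.
 */
#include <asm/neon.h>

static void example_xor_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	kernel_neon_begin();			/* allow NEON use in kernel mode */
	xor_arm64_neon_2(bytes, p1, p2);	/* p1 ^= p2 over 'bytes' bytes */
	kernel_neon_end();
}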