/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
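/*
 * Illustrative usage sketch (not part of the original file, kept under
 * "#if 0" so it is never built): callers normally reach these routines
 * through the generic xor_blocks() code by way of the exported template
 * above, but they can also be invoked directly as shown. NEON use in
 * kernel context must be bracketed by kernel_neon_begin()/
 * kernel_neon_end(), and 'bytes' is assumed to be a non-zero multiple
 * of 64 (the loops process four 16-byte vectors per iteration).
 */
#if 0
#include <asm/neon.h>

static void xor_neon_2_example(unsigned long bytes,
			       unsigned long *dst, unsigned long *src)
{
	kernel_neon_begin();			/* make NEON registers usable */
	xor_arm64_neon_2(bytes, dst, src);	/* dst ^= src, 'bytes' bytes */
	kernel_neon_end();
}
#endif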