1*cc9f8349SJackie Liu /* 2*cc9f8349SJackie Liu * arch/arm64/lib/xor-neon.c 3*cc9f8349SJackie Liu * 4*cc9f8349SJackie Liu * Authors: Jackie Liu <liuyun01@kylinos.cn> 5*cc9f8349SJackie Liu * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. 6*cc9f8349SJackie Liu * 7*cc9f8349SJackie Liu * This program is free software; you can redistribute it and/or modify 8*cc9f8349SJackie Liu * it under the terms of the GNU General Public License version 2 as 9*cc9f8349SJackie Liu * published by the Free Software Foundation. 10*cc9f8349SJackie Liu */ 11*cc9f8349SJackie Liu 12*cc9f8349SJackie Liu #include <linux/raid/xor.h> 13*cc9f8349SJackie Liu #include <linux/module.h> 14*cc9f8349SJackie Liu #include <asm/neon-intrinsics.h> 15*cc9f8349SJackie Liu 16*cc9f8349SJackie Liu void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, 17*cc9f8349SJackie Liu unsigned long *p2) 18*cc9f8349SJackie Liu { 19*cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 20*cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 21*cc9f8349SJackie Liu 22*cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 23*cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 24*cc9f8349SJackie Liu 25*cc9f8349SJackie Liu do { 26*cc9f8349SJackie Liu /* p1 ^= p2 */ 27*cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 28*cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 29*cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 30*cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 31*cc9f8349SJackie Liu 32*cc9f8349SJackie Liu /* store */ 33*cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 34*cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 35*cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 36*cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 37*cc9f8349SJackie Liu 38*cc9f8349SJackie Liu dp1 += 8; 39*cc9f8349SJackie Liu dp2 += 8; 40*cc9f8349SJackie Liu } while (--lines > 0); 41*cc9f8349SJackie Liu } 42*cc9f8349SJackie Liu 43*cc9f8349SJackie Liu void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, 44*cc9f8349SJackie Liu unsigned long *p2, unsigned long *p3) 45*cc9f8349SJackie Liu { 46*cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 47*cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 48*cc9f8349SJackie Liu uint64_t *dp3 = (uint64_t *)p3; 49*cc9f8349SJackie Liu 50*cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 51*cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 52*cc9f8349SJackie Liu 53*cc9f8349SJackie Liu do { 54*cc9f8349SJackie Liu /* p1 ^= p2 */ 55*cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 56*cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 57*cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 58*cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 59*cc9f8349SJackie Liu 60*cc9f8349SJackie Liu /* p1 ^= p3 */ 61*cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 62*cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 63*cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 64*cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 65*cc9f8349SJackie Liu 66*cc9f8349SJackie Liu /* store */ 67*cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 68*cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 69*cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 70*cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 71*cc9f8349SJackie Liu 72*cc9f8349SJackie Liu dp1 += 8; 73*cc9f8349SJackie Liu dp2 += 8; 74*cc9f8349SJackie Liu dp3 += 8; 75*cc9f8349SJackie Liu } while (--lines > 0); 76*cc9f8349SJackie Liu } 77*cc9f8349SJackie Liu 78*cc9f8349SJackie Liu void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, 79*cc9f8349SJackie Liu unsigned long *p2, unsigned long *p3, unsigned long *p4) 80*cc9f8349SJackie Liu { 81*cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 82*cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 83*cc9f8349SJackie Liu uint64_t *dp3 = (uint64_t *)p3; 84*cc9f8349SJackie Liu uint64_t *dp4 = (uint64_t *)p4; 85*cc9f8349SJackie Liu 86*cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 87*cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 88*cc9f8349SJackie Liu 89*cc9f8349SJackie Liu do { 90*cc9f8349SJackie Liu /* p1 ^= p2 */ 91*cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 92*cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 93*cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 94*cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 95*cc9f8349SJackie Liu 96*cc9f8349SJackie Liu /* p1 ^= p3 */ 97*cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 98*cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 99*cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 100*cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 101*cc9f8349SJackie Liu 102*cc9f8349SJackie Liu /* p1 ^= p4 */ 103*cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 104*cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 105*cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 106*cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 107*cc9f8349SJackie Liu 108*cc9f8349SJackie Liu /* store */ 109*cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 110*cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 111*cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 112*cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 113*cc9f8349SJackie Liu 114*cc9f8349SJackie Liu dp1 += 8; 115*cc9f8349SJackie Liu dp2 += 8; 116*cc9f8349SJackie Liu dp3 += 8; 117*cc9f8349SJackie Liu dp4 += 8; 118*cc9f8349SJackie Liu } while (--lines > 0); 119*cc9f8349SJackie Liu } 120*cc9f8349SJackie Liu 121*cc9f8349SJackie Liu void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, 122*cc9f8349SJackie Liu unsigned long *p2, unsigned long *p3, 123*cc9f8349SJackie Liu unsigned long *p4, unsigned long *p5) 124*cc9f8349SJackie Liu { 125*cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 126*cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 127*cc9f8349SJackie Liu uint64_t *dp3 = (uint64_t *)p3; 128*cc9f8349SJackie Liu uint64_t *dp4 = (uint64_t *)p4; 129*cc9f8349SJackie Liu uint64_t *dp5 = (uint64_t *)p5; 130*cc9f8349SJackie Liu 131*cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 132*cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 133*cc9f8349SJackie Liu 134*cc9f8349SJackie Liu do { 135*cc9f8349SJackie Liu /* p1 ^= p2 */ 136*cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 137*cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 138*cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 139*cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 140*cc9f8349SJackie Liu 141*cc9f8349SJackie Liu /* p1 ^= p3 */ 142*cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 143*cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 144*cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 145*cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 146*cc9f8349SJackie Liu 147*cc9f8349SJackie Liu /* p1 ^= p4 */ 148*cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 149*cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 150*cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 151*cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 152*cc9f8349SJackie Liu 153*cc9f8349SJackie Liu /* p1 ^= p5 */ 154*cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); 155*cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); 156*cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); 157*cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); 158*cc9f8349SJackie Liu 159*cc9f8349SJackie Liu /* store */ 160*cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 161*cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 162*cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 163*cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 164*cc9f8349SJackie Liu 165*cc9f8349SJackie Liu dp1 += 8; 166*cc9f8349SJackie Liu dp2 += 8; 167*cc9f8349SJackie Liu dp3 += 8; 168*cc9f8349SJackie Liu dp4 += 8; 169*cc9f8349SJackie Liu dp5 += 8; 170*cc9f8349SJackie Liu } while (--lines > 0); 171*cc9f8349SJackie Liu } 172*cc9f8349SJackie Liu 173*cc9f8349SJackie Liu struct xor_block_template const xor_block_inner_neon = { 174*cc9f8349SJackie Liu .name = "__inner_neon__", 175*cc9f8349SJackie Liu .do_2 = xor_arm64_neon_2, 176*cc9f8349SJackie Liu .do_3 = xor_arm64_neon_3, 177*cc9f8349SJackie Liu .do_4 = xor_arm64_neon_4, 178*cc9f8349SJackie Liu .do_5 = xor_arm64_neon_5, 179*cc9f8349SJackie Liu }; 180*cc9f8349SJackie Liu EXPORT_SYMBOL(xor_block_inner_neon); 181*cc9f8349SJackie Liu 182*cc9f8349SJackie Liu MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>"); 183*cc9f8349SJackie Liu MODULE_DESCRIPTION("ARMv8 XOR Extensions"); 184*cc9f8349SJackie Liu MODULE_LICENSE("GPL"); 185