1*d2912cb1SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only 2cc9f8349SJackie Liu /* 3cc9f8349SJackie Liu * arch/arm64/lib/xor-neon.c 4cc9f8349SJackie Liu * 5cc9f8349SJackie Liu * Authors: Jackie Liu <liuyun01@kylinos.cn> 6cc9f8349SJackie Liu * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. 7cc9f8349SJackie Liu */ 8cc9f8349SJackie Liu 9cc9f8349SJackie Liu #include <linux/raid/xor.h> 10cc9f8349SJackie Liu #include <linux/module.h> 11cc9f8349SJackie Liu #include <asm/neon-intrinsics.h> 12cc9f8349SJackie Liu 13cc9f8349SJackie Liu void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, 14cc9f8349SJackie Liu unsigned long *p2) 15cc9f8349SJackie Liu { 16cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 17cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 18cc9f8349SJackie Liu 19cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 20cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 21cc9f8349SJackie Liu 22cc9f8349SJackie Liu do { 23cc9f8349SJackie Liu /* p1 ^= p2 */ 24cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 25cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 26cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 27cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 28cc9f8349SJackie Liu 29cc9f8349SJackie Liu /* store */ 30cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 31cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 32cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 33cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 34cc9f8349SJackie Liu 35cc9f8349SJackie Liu dp1 += 8; 36cc9f8349SJackie Liu dp2 += 8; 37cc9f8349SJackie Liu } while (--lines > 0); 38cc9f8349SJackie Liu } 39cc9f8349SJackie Liu 40cc9f8349SJackie Liu void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, 41cc9f8349SJackie Liu unsigned long *p2, unsigned long *p3) 42cc9f8349SJackie Liu { 43cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 44cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 45cc9f8349SJackie Liu uint64_t *dp3 = (uint64_t *)p3; 46cc9f8349SJackie Liu 47cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 48cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 49cc9f8349SJackie Liu 50cc9f8349SJackie Liu do { 51cc9f8349SJackie Liu /* p1 ^= p2 */ 52cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 53cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 54cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 55cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 56cc9f8349SJackie Liu 57cc9f8349SJackie Liu /* p1 ^= p3 */ 58cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 59cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 60cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 61cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 62cc9f8349SJackie Liu 63cc9f8349SJackie Liu /* store */ 64cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 65cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 66cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 67cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 68cc9f8349SJackie Liu 69cc9f8349SJackie Liu dp1 += 8; 70cc9f8349SJackie Liu dp2 += 8; 71cc9f8349SJackie Liu dp3 += 8; 72cc9f8349SJackie Liu } while (--lines > 0); 73cc9f8349SJackie Liu } 74cc9f8349SJackie Liu 75cc9f8349SJackie Liu void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, 76cc9f8349SJackie Liu unsigned long *p2, unsigned long *p3, unsigned long *p4) 77cc9f8349SJackie Liu { 78cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 79cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 80cc9f8349SJackie Liu uint64_t *dp3 = (uint64_t *)p3; 81cc9f8349SJackie Liu uint64_t *dp4 = (uint64_t *)p4; 82cc9f8349SJackie Liu 83cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 84cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 85cc9f8349SJackie Liu 86cc9f8349SJackie Liu do { 87cc9f8349SJackie Liu /* p1 ^= p2 */ 88cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 89cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 90cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 91cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 92cc9f8349SJackie Liu 93cc9f8349SJackie Liu /* p1 ^= p3 */ 94cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 95cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 96cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 97cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 98cc9f8349SJackie Liu 99cc9f8349SJackie Liu /* p1 ^= p4 */ 100cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 101cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 102cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 103cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 104cc9f8349SJackie Liu 105cc9f8349SJackie Liu /* store */ 106cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 107cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 108cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 109cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 110cc9f8349SJackie Liu 111cc9f8349SJackie Liu dp1 += 8; 112cc9f8349SJackie Liu dp2 += 8; 113cc9f8349SJackie Liu dp3 += 8; 114cc9f8349SJackie Liu dp4 += 8; 115cc9f8349SJackie Liu } while (--lines > 0); 116cc9f8349SJackie Liu } 117cc9f8349SJackie Liu 118cc9f8349SJackie Liu void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, 119cc9f8349SJackie Liu unsigned long *p2, unsigned long *p3, 120cc9f8349SJackie Liu unsigned long *p4, unsigned long *p5) 121cc9f8349SJackie Liu { 122cc9f8349SJackie Liu uint64_t *dp1 = (uint64_t *)p1; 123cc9f8349SJackie Liu uint64_t *dp2 = (uint64_t *)p2; 124cc9f8349SJackie Liu uint64_t *dp3 = (uint64_t *)p3; 125cc9f8349SJackie Liu uint64_t *dp4 = (uint64_t *)p4; 126cc9f8349SJackie Liu uint64_t *dp5 = (uint64_t *)p5; 127cc9f8349SJackie Liu 128cc9f8349SJackie Liu register uint64x2_t v0, v1, v2, v3; 129cc9f8349SJackie Liu long lines = bytes / (sizeof(uint64x2_t) * 4); 130cc9f8349SJackie Liu 131cc9f8349SJackie Liu do { 132cc9f8349SJackie Liu /* p1 ^= p2 */ 133cc9f8349SJackie Liu v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); 134cc9f8349SJackie Liu v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); 135cc9f8349SJackie Liu v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); 136cc9f8349SJackie Liu v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); 137cc9f8349SJackie Liu 138cc9f8349SJackie Liu /* p1 ^= p3 */ 139cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); 140cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); 141cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); 142cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); 143cc9f8349SJackie Liu 144cc9f8349SJackie Liu /* p1 ^= p4 */ 145cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); 146cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); 147cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); 148cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); 149cc9f8349SJackie Liu 150cc9f8349SJackie Liu /* p1 ^= p5 */ 151cc9f8349SJackie Liu v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); 152cc9f8349SJackie Liu v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); 153cc9f8349SJackie Liu v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); 154cc9f8349SJackie Liu v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); 155cc9f8349SJackie Liu 156cc9f8349SJackie Liu /* store */ 157cc9f8349SJackie Liu vst1q_u64(dp1 + 0, v0); 158cc9f8349SJackie Liu vst1q_u64(dp1 + 2, v1); 159cc9f8349SJackie Liu vst1q_u64(dp1 + 4, v2); 160cc9f8349SJackie Liu vst1q_u64(dp1 + 6, v3); 161cc9f8349SJackie Liu 162cc9f8349SJackie Liu dp1 += 8; 163cc9f8349SJackie Liu dp2 += 8; 164cc9f8349SJackie Liu dp3 += 8; 165cc9f8349SJackie Liu dp4 += 8; 166cc9f8349SJackie Liu dp5 += 8; 167cc9f8349SJackie Liu } while (--lines > 0); 168cc9f8349SJackie Liu } 169cc9f8349SJackie Liu 170cc9f8349SJackie Liu struct xor_block_template const xor_block_inner_neon = { 171cc9f8349SJackie Liu .name = "__inner_neon__", 172cc9f8349SJackie Liu .do_2 = xor_arm64_neon_2, 173cc9f8349SJackie Liu .do_3 = xor_arm64_neon_3, 174cc9f8349SJackie Liu .do_4 = xor_arm64_neon_4, 175cc9f8349SJackie Liu .do_5 = xor_arm64_neon_5, 176cc9f8349SJackie Liu }; 177cc9f8349SJackie Liu EXPORT_SYMBOL(xor_block_inner_neon); 178cc9f8349SJackie Liu 179cc9f8349SJackie Liu MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>"); 180cc9f8349SJackie Liu MODULE_DESCRIPTION("ARMv8 XOR Extensions"); 181cc9f8349SJackie Liu MODULE_LICENSE("GPL"); 182