/*
 * Copyright (C) 2012 Intel Corporation
 * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <arm_neon.h>

#ifdef CONFIG_ARM
/*
 * AArch32 does not provide this intrinsic natively because it does not
 * implement the underlying instruction. AArch32 only provides a 64-bit
 * wide vtbl.8 instruction, so use that instead.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} __a = { a };

	return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
			   vtbl2_u8(__a.pair, vget_high_u8(b)));
}
#endif

void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
			      uint8_t *dq, const uint8_t *pbmul,
			      const uint8_t *qmul)
{
	uint8x16_t pm0 = vld1q_u8(pbmul);
	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
	uint8x16_t qm0 = vld1q_u8(qmul);
	uint8x16_t qm1 = vld1q_u8(qmul + 16);
	uint8x16_t x0f = vdupq_n_u8(0x0f);

	/*
	 * while ( bytes-- ) {
	 *	uint8_t px, qx, db;
	 *
	 *	px = *p ^ *dp;
	 *	qx = qmul[*q ^ *dq];
	 *	*dq++ = db = pbmul[px] ^ qx;
	 *	*dp++ = db ^ px;
	 *	p++; q++;
	 * }
	 */

	while (bytes) {
		uint8x16_t vx, vy, px, qx, db;

		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		vy = vshrq_n_u8(vx, 4);
		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
		vy = vqtbl1q_u8(qm1, vy);
		qx = veorq_u8(vx, vy);

		vy = vshrq_n_u8(px, 4);
		vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
		vy = vqtbl1q_u8(pm1, vy);
		vx = veorq_u8(vx, vy);
		db = veorq_u8(vx, qx);

		vst1q_u8(dq, db);
		vst1q_u8(dp, veorq_u8(db, px));

		bytes -= 16;
		p += 16;
		q += 16;
		dp += 16;
		dq += 16;
	}
}

void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
			      const uint8_t *qmul)
{
	uint8x16_t qm0 = vld1q_u8(qmul);
	uint8x16_t qm1 = vld1q_u8(qmul + 16);
	uint8x16_t x0f = vdupq_n_u8(0x0f);

	/*
	 * while (bytes--) {
	 *	*p++ ^= *dq = qmul[*q ^ *dq];
	 *	q++; dq++;
	 * }
	 */

	while (bytes) {
		uint8x16_t vx, vy;

		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));

		vy = vshrq_n_u8(vx, 4);
		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
		vy = vqtbl1q_u8(qm1, vy);
		vx = veorq_u8(vx, vy);
		vy = veorq_u8(vx, vld1q_u8(p));

		vst1q_u8(dq, vx);
		vst1q_u8(p, vy);

		bytes -= 16;
		p += 16;
		q += 16;
		dq += 16;
	}
}
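
/*
 * Illustrative sketch, not part of the original sources: the pbmul/qmul
 * arguments appear to be 32-byte tables split by nibble (bytes 0-15 indexed
 * by the low nibble of the operand, bytes 16-31 by the high nibble), which
 * is the layout the vld1q_u8(tbl)/vld1q_u8(tbl + 16) loads and the
 * vandq_u8()/vshrq_n_u8() indexing above rely on.  The helper below is a
 * hypothetical scalar equivalent of one such lookup; the two vqtbl1q_u8()
 * calls plus the veorq_u8() in the vector loops perform the same
 * computation on sixteen byte lanes at once.
 */
static inline uint8_t gf_mul_nibble_tbl(const uint8_t *tbl, uint8_t x)
{
	/* XOR of the low-nibble and high-nibble partial products */
	return tbl[x & 0x0f] ^ tbl[16 + (x >> 4)];
}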