// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#include <linux/raid/pq.h>
#include "x86.h"

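/*
 * 0x1d is the low byte of the GF(2^8) generator polynomial used by RAID-6
 * (x^8 + x^4 + x^3 + x^2 + 1, i.e. 0x11d); it is XORed into a byte whenever
 * that byte overflows while being multiplied by 2 in the field.
 */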
static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

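	/*
	 * For each 32-byte chunk, every data disk is XORed into P, while the
	 * running Q is first multiplied by 2 in GF(2^8): vpcmpgtb against the
	 * zero register builds an all-ones mask for bytes whose top bit is
	 * set, vpaddb doubles each byte, and the masked 0x1d constant is
	 * XORed back in to reduce modulo the field polynomial.
	 */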
	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

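	/*
	 * P/Q update for a partial stripe: ymm2 starts from the current P,
	 * the first loop folds the data of disks stop..start into both P and
	 * the Q accumulator, the "left side" loop only keeps doubling Q to
	 * account for the disk positions below start, and the accumulated Q
	 * delta is finally XORed into the stored Q.
	 */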
	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

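/*
 * Descriptor tables like this one are picked up by lib/raid6/algos.c, which
 * benchmarks the registered implementations at init time and prefers entries
 * with a higher .priority over the priority-1 ones such as SSE2.
 */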
const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};

/*
 * Unrolled-by-2 AVX2 implementation
 */
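/*
 * Same algorithm as above, but two independent 32-byte lanes are processed
 * per iteration, giving the CPU separate dependency chains for the otherwise
 * serial GF(2^8) doubling of Q.
 */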
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
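/*
 * Keeps four P and four Q accumulators resident in ymm registers at once;
 * that requires ymm8-ymm15, which only exist in 64-bit mode, hence the
 * CONFIG_X86_64 guard.
 */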
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

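	/*
	 * All eight accumulators stay in registers across a whole 128-byte
	 * chunk; each pass over a data disk prefetches its four 32-byte
	 * blocks before they are loaded, so the fetch overlaps the GF(2^8)
	 * doubling of the previous contribution.
	 */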
	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	.priority = 2		/* Prefer AVX2 over priority 1 (SSE2 and others) */
};
#endif /* CONFIG_X86_64 */