//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Function API:
//       UINT16 crc_t10dif_pmull(
//               UINT16 init_crc, //initial CRC value, 16 bits
//               const unsigned char *buf, //buffer pointer to calculate CRC on
//               UINT64 len //buffer length in bytes (64-bit data)
//       );
//
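//       A minimal sketch of the matching C-side declaration and a call, as
//       the arm64 glue code would be expected to use it (the asmlinkage
//       qualifier and kernel integer types are assumptions, not taken from
//       this file):
//
//           asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, u64 len);
//
//           crc = crc_t10dif_pmull(crc, data, length);
//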
//       Reference paper titled "Fast CRC Computation for Generic
//       Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//            /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13

ENTRY(crc_t10dif_pmull)
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16
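	// (the folding constants below are defined relative to the polynomial
	// scaled to 32 bits, i.e. 0x8bb70000, so the 16-bit seed is placed in
	// the upper half of a 32-bit lane here and the final result is scaled
	// back down by the lsr #16 in _cleanup)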

	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256, we can't fold 128B at a time
	b.lt		_less_than_128

	// load the initial crc value
	// the crc value does not need to be byte-reflected, but it needs to
	// be moved to the high part of the register, because the data will be
	// byte-reflected and will then line up with the initial crc in the
	// correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128B of data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80

CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
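
	// (on little-endian, each rev64 + ext #8 pair above amounts to a full
	// 16-byte byte reversal, so the input is presented most-significant
	// byte first to the carry-less multiplies)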

	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4
					// whether pmull or pmull2 is used
					// determines which constant applies

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this section of the code, there is 128*x+y (0<=y<128) bytes of
	// buffer. The _fold_64_B_loop will fold 128B at a time
	// until we have 128+y Bytes of buffer


	// fold 128B at a time. This section of the code folds 8 vector
	// registers in parallel
_fold_64_B_loop:

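	// The fold64 macro below folds one pair of accumulator registers into
	// the next 32 bytes of input: pmull/pmull2 multiply the low/high
	// 64-bit halves of each accumulator by the two constants held in v10,
	// and both products are XORed into the freshly loaded (and
	// byte-reversed) data. This is the carry-less folding identity from
	// the paper referenced above: for a chunk A(x) that lies k bits ahead
	// of later data B(x),
	//
	//   (A(x) * x^k + B(x)) mod P(x) == (A(x) * (x^k mod P(x)) + B(x)) mod P(x)
	//
	// so multiplying by the small precomputed constant x^k mod P(x) folds
	// the chunk forward instead of carrying it along.
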
	.macro		fold64, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	pmull2		v8.1q, \reg1\().2d, v10.2d
	pmull		\reg1\().1q, \reg1\().1d, v10.1d

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	pmull2		v9.1q, \reg2\().2d, v10.2d
	pmull		\reg2\().1q, \reg2\().1d, v10.1d

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	fold64		v0, v1
	fold64		v2, v3
	fold64		v4, v5
	fold64		v6, v7

	subs		arg3, arg3, #128

	// check if there is another 128B in the buffer to be able to fold
	b.lt		_fold_64_B_end

	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	endif_yield_neon

	b		_fold_64_B_loop

_fold_64_B_end:
	// at this point, the buffer pointer is pointing at the last y Bytes
	// of the buffer, and the 128B of folded data is in the 8 vector
	// registers: v0 through v7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	ldr_l		q10, rk9, x8

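	// The fold16 macro folds one 16-byte register into v7 using the pair
	// of constants currently held in v10, then (unless \rk is omitted)
	// loads the constant pair named by \rk for the next register.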
	.macro		fold16, reg, rk
	pmull		v8.1q, \reg\().1d, v10.1d
	pmull2		\reg\().1q, \reg\().2d, v10.2d
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	fold16		v0, rk11
	fold16		v1, rk13
	fold16		v2, rk15
	fold16		v3, rk17
	fold16		v4, rk19
	fold16		v5, rk1
	fold16		v6

	// instead of adding 128, we add 112 (128 - 16) to the loop counter to
	// save 1 instruction from the loop; instead of a cmp instruction, we
	// use the negative flag with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		_final_reduction_for_128

	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
	// continue folding 16B at a time
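	// (v10 still holds rk1 and rk2 here: the final fold16 above loaded
	// them, and the short-buffer path below loads them explicitly before
	// branching to this loop)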

_16B_reduction_loop:
	pmull		v8.1q, v7.1d, v10.1d
	pmull2		v7.1q, v7.2d, v10.2d
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// b.ge instruction, equivalent of: cmp arg3, 16-16
	// check if there is any more 16B in the buffer to be able to fold
	b.ge		_16B_reduction_loop

	// now we have 16+z bytes left to reduce, where 0<= z < 16.
	// first, we reduce the data in the v7 register
_final_reduction_for_128:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		_128_done

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]
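
	// (how the shuffle mask works: tbl returns zero for any index byte
	// that has its top bit set, so the entries of tbl_shf_table give a
	// byte-wise left shift, the same entries XORed with 0x80 give the
	// complementary right shift, and sshr #7 turns that top bit into a
	// full-byte mask so that bsl can blend the two shifted values)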

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 Bytes
	pmull		v8.1q, v7.1d, v10.1d
	pmull2		v7.1q, v7.2d, v10.2d
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b

_128_done:
	// compute crc of a 128-bit value
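	// (the 128-bit remainder in v7 is folded down in two steps, a 64-bit
	// and a 32-bit fold, using rk5 and rk6, which per the table below
	// encode 2^(32*3) mod Q and 2^(32*2) mod Q shifted left by 32; the
	// result is then reduced to the final 32-bit CRC by the Barrett step
	// that follows)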
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	pmull		v7.1q, v7.1d, v10.1d
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	pmull2		v0.1q, v0.2d, v10.2d
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
_barrett:
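	// (Barrett reduction as described in the reference paper: rk7 is
	// floor(2^64/Q) and rk8 is Q itself, per the table below; the two
	// carry-less multiplies estimate the quotient of the remaining value
	// by Q and then cancel quotient * Q, leaving the 32-bit remainder,
	// which is read out of v7.s[1])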
	ldr_l		q10, rk7, x8
	mov		v0.d[0], v7.d[1]

	pmull		v0.1q, v0.1d, v10.1d
	ext		v0.16b, vzr.16b, v0.16b, #12
	pmull2		v0.1q, v0.2d, v10.2d
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]

_cleanup:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

_less_than_128:
	cbz		arg3, _cleanup

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		_128_done		// exactly 16 left
	b.lt		_less_than_16_left

	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		_16B_reduction_loop

	add		arg3, arg3, #16
	b		_get_last_two_regs

_less_than_16_left:
	// load the byte-shift mask for the arg3 remaining bytes
	// (see tbl_shf_table below)
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		_128_done
ENDPROC(crc_t10dif_pmull)

// precomputed constants
// these constants are precomputed from the poly:
// 0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
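//
// The polynomial arithmetic behind the definitions above can be reproduced on
// the host with a few lines of C. This is only an illustrative, commented-out
// sketch (it is not part of the build, and the helper name xn_mod_q is made
// up here); it computes x^n mod Q over GF(2), which the comments above say is
// the basis of the rk values (each one shifted left by a further 32 bits):
//
//	#include <stdint.h>
//
//	/* x^n mod Q over GF(2), with Q = 0x18bb70000 (degree 32) */
//	static uint64_t xn_mod_q(unsigned int n)
//	{
//		uint64_t r = 1;				/* the polynomial "1" */
//
//		while (n--) {
//			r <<= 1;			/* multiply by x */
//			if (r & (1ULL << 32))		/* degree hit 32: reduce */
//				r ^= 0x18bb70000ULL;
//		}
//		return r;
//	}
//
//	/* e.g. rk1 would then be xn_mod_q(32 * 3) << 32, per the comment above */
//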

rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0