1//
2// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
3//
4// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
5//
6// This program is free software; you can redistribute it and/or modify
7// it under the terms of the GNU General Public License version 2 as
8// published by the Free Software Foundation.
9//
10
11//
12// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
13//
14// Copyright (c) 2013, Intel Corporation
15//
16// Authors:
17//     Erdinc Ozturk <erdinc.ozturk@intel.com>
18//     Vinodh Gopal <vinodh.gopal@intel.com>
19//     James Guilford <james.guilford@intel.com>
20//     Tim Chen <tim.c.chen@linux.intel.com>
21//
22// This software is available to you under a choice of one of two
23// licenses.  You may choose to be licensed under the terms of the GNU
24// General Public License (GPL) Version 2, available from the file
25// COPYING in the main directory of this source tree, or the
26// OpenIB.org BSD license below:
27//
28// Redistribution and use in source and binary forms, with or without
29// modification, are permitted provided that the following conditions are
30// met:
31//
32// * Redistributions of source code must retain the above copyright
33//   notice, this list of conditions and the following disclaimer.
34//
35// * Redistributions in binary form must reproduce the above copyright
36//   notice, this list of conditions and the following disclaimer in the
37//   documentation and/or other materials provided with the
38//   distribution.
39//
40// * Neither the name of the Intel Corporation nor the names of its
41//   contributors may be used to endorse or promote products derived from
42//   this software without specific prior written permission.
43//
44//
45// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
46// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
49// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
50// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
51// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
52// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
53// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
54// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
55// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56//
57//       Function API:
58//       UINT16 crc_t10dif_pcl(
59//               UINT16 init_crc, //initial CRC value, 16 bits
60//               const unsigned char *buf, //buffer pointer to calculate CRC on
61//               UINT64 len //buffer length in bytes (64-bit data)
62//       );
63//
//       Reference paper titled "Fast CRC Computation for Generic
//       Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
68//
69//
70
71#include <linux/linkage.h>
72#include <asm/assembler.h>
73
// CPU_LE(insn): emit the wrapped text on little-endian builds only. When
// CONFIG_CPU_ENDIAN_BE8 is defined the macro expands to nothing, so the
// byte-reversal instructions wrapped in it are dropped from the build.
#ifndef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)		code
#else
#define CPU_LE(code...)
#endif
79
// Section, FPU selection and register aliases used throughout this file.
	.text
	.fpu		crypto-neon-fp-armv8

	// Core registers holding the three incoming arguments.
	arg1_low32	.req	r0
	arg2		.req	r1
	arg3		.req	r2

	// Low/high 64-bit halves of q0-q7, so that macros can build a
	// d-register name from a q-register name by appending 'l' or 'h'.
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15

	// q13 is cleared at function entry and then only read, as a source
	// of zero bytes for vext.
	qzr		.req	q13
105
//
// crc_t10dif_pmull - CRC-T10DIF over a buffer, using vmull.p64 folding
//
// In:	r0 (arg1_low32)	initial CRC; it is shifted left by 16 on entry
//	r1 (arg2)	input buffer. All loads except the final partial
//			one carry the :128 alignment hint, so the buffer
//			must be 16-byte aligned.
//	r2 (arg3)	length in bytes (tested with signed conditions)
// Out:	r0		resulting CRC in bits [15:0]
//
// Clobbers r1, r2, ip, the condition flags and q0-q13.
//
// NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS and are
// not preserved here - presumably the caller owns the NEON register file
// around this call (e.g. kernel_neon_begin/end); confirm in the glue code.
// NOTE(review): the short-input path always loads a full 16 bytes, even
// when the length is below 16, i.e. it reads beyond the end of such a
// buffer - confirm the callers never rely on that not happening.
//
// All internal labels use the .L prefix so that they stay out of the
// symbol table and every address in here symbolizes as crc_t10dif_pmull.
//

	// fold64 reg1, reg2
	// Fold two 128-bit accumulators forward onto the next 32 input
	// bytes:  reg = reg.lo * d20 ^ reg.hi * d21 ^ byte-reversed(input)
	// Expects the fold constants in q10 (d20/d21).
	// Clobbers q8, q9, q11, q12 and advances arg2 by 32.
	.macro		fold64, reg1, reg2
	vld1.64		{q11-q12}, [arg2, :128]!

	vmull.p64	q8, \reg1\()h, d21
	vmull.p64	\reg1, \reg1\()l, d20
	vmull.p64	q9, \reg2\()h, d21
	vmull.p64	\reg2, \reg2\()l, d20

CPU_LE(	vrev64.8	q11, q11		)
CPU_LE(	vrev64.8	q12, q12		)
	vswp		d22, d23
	vswp		d24, d25

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm

	// fold16 reg [, rk]
	// Fold accumulator \reg into q7 using the constants currently in
	// q10:  q7 ^= reg.lo * d20 ^ reg.hi * d21
	// If a second argument is given, the next 16 bytes at [ip] are
	// loaded into q10 (and ip is advanced) for the following fold. Only
	// the presence of \rk is tested; by convention it names the
	// constant that gets loaded. Clobbers q8 and \reg.
	.macro		fold16, reg, rk
	vmull.p64	q8, \reg\()l, d20
	vmull.p64	\reg, \reg\()h, d21
	.ifnb		\rk
	vld1.64		{q10}, [ip, :128]!
	.endif
	veor.8		q7, q7, q8
	veor.8		q7, q7, \reg
	.endm

ENTRY(crc_t10dif_pmull)
	vmov.i8		qzr, #0			// init zero register

	// scale the 16-bit initial CRC to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// inputs shorter than 256 bytes cannot go through the 128-byte
	// folding loop below, so they take the 16-bytes-at-a-time path
	cmp		arg3, #256
	blt		.Lless_than_256

	// Place the scaled CRC in the top 32 bits of q10 with all other
	// bytes zero. The input is byte-reversed after loading, so the
	// first input byte ends up in the top byte of q0 and lines up with
	// the CRC there.
	vmov		s0, arg1_low32		// initial crc
	vext.8		q10, qzr, q0, #4

	// load the first 128 bytes of input into q0-q7
	vld1.64		{q0-q1}, [arg2, :128]!
	vld1.64		{q2-q3}, [arg2, :128]!
	vld1.64		{q4-q5}, [arg2, :128]!
	vld1.64		{q6-q7}, [arg2, :128]!

	// reverse the bytes within each 64-bit half (little-endian builds
	// only) and then swap the halves of every register
CPU_LE(	vrev64.8	q0, q0			)
CPU_LE(	vrev64.8	q1, q1			)
CPU_LE(	vrev64.8	q2, q2			)
CPU_LE(	vrev64.8	q3, q3			)
CPU_LE(	vrev64.8	q4, q4			)
CPU_LE(	vrev64.8	q5, q5			)
CPU_LE(	vrev64.8	q6, q6			)
CPU_LE(	vrev64.8	q7, q7			)

	vswp		d0, d1
	vswp		d2, d3
	vswp		d4, d5
	vswp		d6, d7
	vswp		d8, d9
	vswp		d10, d11
	vswp		d12, d13
	vswp		d14, d15

	// XOR the initial CRC into the first 16 bytes of data
	veor.8		q0, q0, q10

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// q10 = { rk3, rk4 }

	// 128 bytes have been consumed; subtracting 256 rather than 128
	// biases the counter so that the loop can test it with a single
	// subs: from here on arg3 = (bytes not yet loaded) - 128
	sub		arg3, arg3, #256

	// Fold 128 bytes per iteration into the eight accumulators q0-q7,
	// for as long as at least 128 more bytes remain afterwards.
.Lfold_128B_loop:
	fold64		q0, q1
	fold64		q2, q3
	fold64		q4, q5
	fold64		q6, q7

	subs		arg3, arg3, #128
	bge		.Lfold_128B_loop

	// Fewer than 128 bytes are left in memory and 128 bytes of folded
	// data sit in q0-q7. Fold q0-q6 into q7, walking the constant
	// table from rk9 onwards, one pair of constants per register.
	adr		ip, rk9
	vld1.64		{q10}, [ip, :128]!	// q10 = { rk9, rk10 }

	fold16		q0, rk11
	fold16		q1, rk13
	fold16		q2, rk15
	fold16		q3, rk17
	fold16		q4, rk19
	fold16		q5, rk1
	fold16		q6			// leaves q10 = { rk1, rk2 }

	// Re-bias the counter by 128 - 16 so that it becomes
	// (bytes not yet loaded) - 16; a negative result means there is
	// no complete 16-byte block left.
	adds		arg3, arg3, #(128-16)
	blt		.Lfinal_reduction

	// q7 holds 16 bytes of folded data and at least 16 more bytes are
	// in memory: fold q7 onto the next 16 input bytes and repeat.
	// Expects q10 = { rk1, rk2 }.
.Lfold_16B_loop:
	vmull.p64	q8, d14, d20
	vmull.p64	q7, d15, d21
	veor.8		q7, q7, q8

	vld1.64		{q0}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0		)
	vswp		d0, d1
	veor.8		q7, q7, q0
	subs		arg3, arg3, #16

	// the flags of the subs above stand in for a separate compare:
	// loop while another full 16 bytes remain
	bge		.Lfold_16B_loop

	// 16 bytes are in q7 and z bytes remain in memory, 0 <= z < 16

.Lfinal_reduction:
	// undo the bias: arg3 = number of trailing bytes (0..15). With
	// none left, go straight to the reduction of the 128 bits in q7.
	adds		arg3, arg3, #16
	beq		.Lreduce_128

	// Fewer than 16 bytes are left. Since there was data before the
	// current position, back the pointer up so that the load ends
	// exactly at the end of the buffer and covers 16 bytes, then drop
	// the part that has been processed already. This load carries no
	// alignment hint.
.Lfold_tail:
	add		arg2, arg2, arg3
	sub		arg2, arg2, #16
	vld1.64		{q1}, [arg2]
CPU_LE(	vrev64.8	q1, q1			)
	vswp		d2, d3

	// fetch the vtbl index vector for a shift by arg3 bytes
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]

	// q2 = q7 shifted left by arg3 bytes
	vtbl.8		d4, {d14-d15}, d0
	vtbl.8		d5, {d14-d15}, d1

	// q9 = q7 shifted right by 16 - arg3 bytes (flipping bit 7 of each
	// index swaps which lanes are in range and which read as zero)
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d19, {d14-d15}, d1

	// blend: upper bytes from the shifted q7, lowest arg3 bytes from
	// the freshly loaded tail
	vshr.s8		q0, q0, #7		// convert to 8-bit mask
	vbsl.8		q0, q2, q1

	// fold the bytes shifted out of q7 (now in q9) into the blend
	vmull.p64	q8, d18, d20
	vmull.p64	q7, d19, d21
	veor.8		q7, q7, q8
	veor.8		q7, q7, q0

.Lreduce_128:
	// compute the CRC of the 128-bit value in q7
	vldr		d20, rk5
	vldr		d21, rk6		// q10 = { rk5, rk6 }

	// 64-bit fold: q7 = (q7.hi * rk5) ^ (q7 << 64)
	vext.8		q0, qzr, q7, #8
	vmull.p64	q7, d15, d20
	veor.8		q7, q7, q0

	// 32-bit fold: move the top 32 bits of q7 into q0, clear them in
	// q7 (s3 is zero after the vext), multiply by rk6 and merge
	vext.8		q0, q7, qzr, #12
	vmov		s31, s3
	vmull.p64	q0, d0, d21
	veor.8		q7, q0, q7

	// Barrett reduction
	vldr		d20, rk7
	vldr		d21, rk8

	vmull.p64	q0, d15, d20
	vext.8		q0, qzr, q0, #12	// q0 <<= 32
	vmull.p64	q0, d1, d21
	vext.8		q0, qzr, q0, #12	// q0 <<= 32
	veor.8		q7, q7, q0
	vmov		r0, s29			// bits [63:32] of q7

.Lout:
	// scale the result back to 16 bits
	lsr		r0, r0, #16
	bx		lr

	// Inputs shorter than 256 bytes.
.Lless_than_256:
	// nothing to do for an empty buffer: return the CRC passed in
	teq		arg3, #0
	beq		.Lout

	vmov.i8		q0, #0
	vmov		s3, arg1_low32		// initial crc in the top 32 bits

	// load the first 16 bytes and XOR the initial CRC into them
	vld1.64		{q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q7, q7		)
	vswp		d14, d15
	veor.8		q7, q7, q0

	cmp		arg3, #16
	beq		.Lreduce_128		// exactly 16 bytes
	blt		.Lless_than_16

	// more than 16 bytes: load the 16-byte folding constants
	vldr		d20, rk1
	vldr		d21, rk2		// q10 = { rk1, rk2 }

	// With at least 32 bytes in total, enter the 16-byte loop with
	// arg3 = (bytes not yet loaded) - 16. Otherwise set arg3 to the
	// number of trailing bytes (1..15) and fold those directly. The
	// addlt leaves the flags alone, so the blt still tests the subs.
	subs		arg3, arg3, #32
	addlt		arg3, arg3, #16
	blt		.Lfold_tail
	b		.Lfold_16B_loop

.Lless_than_16:
	// fewer than 16 bytes in total: shift q7 right by 16 - arg3 bytes
	// so that only the arg3 bytes that belong to the input remain
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	// both lookups must read the unmodified q7, so the low half goes
	// through d18 and is copied into d14 afterwards
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d15, {d14-d15}, d1
	vmov		d14, d18
	b		.Lreduce_128
ENDPROC(crc_t10dif_pmull)
369
// Precomputed folding constants, derived from the polynomial
// 0x8bb70000 (0x8bb7 scaled to 32 bits).
//
//   Q   = 0x18BB70000
//   rk1 = 2^(32*3)  mod Q << 32
//   rk2 = 2^(32*5)  mod Q << 32
//   rk3 = 2^(32*15) mod Q << 32
//   rk4 = 2^(32*17) mod Q << 32
//   rk5 = 2^(32*3)  mod Q << 32
//   rk6 = 2^(32*2)  mod Q << 32
//   rk7 = floor(2^64/Q)
//   rk8 = Q
//
// The order of the entries matters: the code loads them as 16-byte
// pairs and walks from rk9 through rk20 and on into rk1/rk2 with a
// post-incremented pointer. The 16-byte alignment is required by the
// :128 hint on those loads.
	.p2align	4

rk3:	.quad		0x9d9d000000000000
rk4:	.quad		0x7cf5000000000000

rk5:	.quad		0x2d56000000000000
rk6:	.quad		0x1368000000000000

rk7:	.quad		0x00000001f65a57f8
rk8:	.quad		0x000000018bb70000

rk9:	.quad		0xceae000000000000
rk10:	.quad		0xbfd6000000000000

rk11:	.quad		0x1e16000000000000
rk12:	.quad		0x713c000000000000

rk13:	.quad		0xf7f9000000000000
rk14:	.quad		0x80a6000000000000

rk15:	.quad		0x044c000000000000
rk16:	.quad		0xe658000000000000

rk17:	.quad		0xad18000000000000
rk18:	.quad		0xa497000000000000

rk19:	.quad		0x6ee3000000000000
rk20:	.quad		0xe7b5000000000000

rk1:	.quad		0x2d56000000000000
rk2:	.quad		0x06df000000000000

// Index vectors for byte-wise shifts with vtbl. The code loads 16 bytes
// from tbl_shf_table + 16 - n: used as is, that vector shifts a register
// left by n bytes; with 0x80 XORed into every byte it shifts right by
// 16 - n bytes. The possible 16-byte windows, written as 128-bit values:
//
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
tbl_shf_table:
	.byte		0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x00
428