arm/crypto/crct10dif-ce-core.S

2 // Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
14 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
62 //  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
75 	.arch		armv8-a
76 	.fpu		crypto-neon-fp-armv8
118 	vld1.64		{q11-q12}, [buf]!
141 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
159 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
165 	// Load the first 128 data bytes.  Byte swapping is necessary to make the bit order match the polynomial coefficient order.
167 	vld1.64		{q0-q1}, [buf]!
168 	vld1.64		{q2-q3}, [buf]!
169 	vld1.64		{q4-q5}, [buf]!
170 	vld1.64		{q6-q7}, [buf]!
189 	vmov.i8		q8h, #0
193 	// Load the constants for folding across 128 bytes.
194 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
196 	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
197 	// 128 to simplify the termination condition of the following loop.
200 	// While >= 128 data bytes remain (not counting q0-q7), fold the 128
201 	// bytes q0-q7 into them, storing the result back into q0-q7.
207 	subs		len, len, #128
210 	// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
213 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
224 	// Add 128 to get the correct number of data bytes remaining in 0...127
225 	// (not counting q7), following the previous extra subtraction by 128; the same add also subtracts 16 (added back later).
228 	adds		len, len, #(128-16)
245 	// Add 16 to get the correct number of data bytes remaining in 0...15 (not counting q7), undoing the earlier extra subtraction of 16.
264 	// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
268 	vtbl.8		q1l, {q7l-q7h}, q2l
269 	vtbl.8		q1h, {q7l-q7h}, q2h
271 	// q3 = first chunk: q7 right-shifted by '16-len' bytes.
272 	vmov.i8		q3, #0x80
274 	vtbl.8		q3l, {q7l-q7h}, q2l
275 	vtbl.8		q3h, {q7l-q7h}, q2h
277 	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
280 	// q2 = second chunk: 'len' bytes from q0 (low-order bytes),
281 	// then '16-len' bytes from q1 (high-order bytes).
291 	// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
294 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
297 	// Fold the high 64 bits into the low 64 bits, while also multiplying by x^64.
298 	// This produces a 128-bit value congruent to x^64 * M(x) whose low 48 bits are 0.
302 	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
303 	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
304 	vmov.i8		q1, #0
311 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]
321 	vmov.u16	r0, q0l[0]
335 	vmov.i8		q0h, #0
339 	// Load the fold-across-16-bytes constants.
340 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
353 // Fold constants precomputed from the polynomial 0x18bb7
354 // G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
// (bit i of 0x18bb7 is the coefficient of x^i in G(x)).
//
// Each adjacent pair of .quads is 16 bytes, the size of one
// 'vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]' load.  Most of those loads
// post-increment fold_consts_ptr, so the pairs appear to be consumed in table
// order -- NOTE(review): confirm against the code that sets fold_consts_ptr.
// The exponents in the first four pairs are bit distances: 8*128 bits = 128
// bytes, 4*128 bits = 64 bytes, 2*128 bits = 32 bytes, 1*128 bits = 16 bytes.
356 	.quad		0x0000000000006123	// x^(8*128)	mod G(x)	(fold across 128 bytes)
357 	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
359 	.quad		0x0000000000001069	// x^(4*128)	mod G(x)	(fold across 64 bytes)
360 	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
362 	.quad		0x000000000000857d	// x^(2*128)	mod G(x)	(fold across 32 bytes)
363 	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
365 	.quad		0x000000000000a010	// x^(1*128)	mod G(x)	(fold across 16 bytes)
366 	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// Final-reduction constants: the 16-bit remainders sit in bits 63:48 of each
// quad, which is the 'x^48 *' factor in the comments.
368 	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
369 	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// The polynomial itself and the precomputed quotient floor(x^48 / G(x)).
371 	.quad		0x0000000000018bb7	// G(x)
372 	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
374 // For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
375 // len] is the index vector to shift left by 'len' bytes, and is also {0x80,
376 // ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
//
// The vectors are used as vtbl.8 indices into the 16-byte table {q7l-q7h}.
// An index with bit 7 set (0x81..0x8f) is out of range for that table, so
// vtbl.8 writes 0 to that lane; indices 0x0..0xe select a byte of q7.  XORing
// the vector with 0x80 swaps which lanes are in range, which is how one table
// serves both shift directions.
//
// For 1 <= len <= 15 the 16-byte window covers table bytes 1..30 only, so the
// first and last bytes (both 0x0) are never selected.
378 	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87	// table[0..7]
379 	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f	// table[8..15]
380 	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7	// table[16..23]
381 	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0	// table[24..31]