Lines Matching +full:0 +full:- +full:128
2 // Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
14 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
62 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
75 .arch armv8-a
76 .fpu crypto-neon-fp-armv8
118 vld1.64 {q11-q12}, [buf]!
141 vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
159 // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
165 // Load the first 128 data bytes. Byte swapping is necessary to make
167 vld1.64 {q0-q1}, [buf]!
168 vld1.64 {q2-q3}, [buf]!
169 vld1.64 {q4-q5}, [buf]!
170 vld1.64 {q6-q7}, [buf]!
189 vmov.i8 q8h, #0
193 // Load the constants for folding across 128 bytes.
194 vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
196 // Subtract 128 for the 128 data bytes just consumed. Subtract another
197 // 128 to simplify the termination condition of the following loop.
200 // While >= 128 data bytes remain (not counting q0-q7), fold the 128
201 // bytes q0-q7 into them, storing the result back into q0-q7.
207 subs len, len, #128
210 // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
213 vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
224 // Add 128 to get the correct number of data bytes remaining in 0...127
225 // (not counting q7), following the previous extra subtraction by 128.
228 adds len, len, #(128-16)
245 // Add 16 to get the correct number of data bytes remaining in 0...15
264 // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
268 vtbl.8 q1l, {q7l-q7h}, q2l
269 vtbl.8 q1h, {q7l-q7h}, q2h
271 // q3 = first chunk: q7 right-shifted by '16-len' bytes.
272 vmov.i8 q3, #0x80
274 vtbl.8 q3l, {q7l-q7h}, q2l
275 vtbl.8 q3h, {q7l-q7h}, q2h
277 // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
280 // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
281 // then '16-len' bytes from q1 (high-order bytes).
291 // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
294 vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
297 // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
298 // whose low 48 bits are 0.
302 // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
303 // value congruent to x^64 * M(x) and whose low 48 bits are 0.
304 vmov.i8 q1, #0
311 vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
321 vmov.u16 r0, q0l[0]
335 vmov.i8 q0h, #0
339 // Load the fold-across-16-bytes constants.
340 vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
353 // Fold constants precomputed from the polynomial 0x18bb7
354 // G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
356 .quad 0x0000000000006123 // x^(8*128) mod G(x)
357 .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
359 .quad 0x0000000000001069 // x^(4*128) mod G(x)
360 .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
362 .quad 0x000000000000857d // x^(2*128) mod G(x)
363 .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
365 .quad 0x000000000000a010 // x^(1*128) mod G(x)
366 .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
368 .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
369 .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
371 .quad 0x0000000000018bb7 // G(x)
372 .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
374 // For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
375 // len] is the index vector to shift left by 'len' bytes, and is also {0x80,
376 // ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
378 .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
379 .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
380 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
381 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0