//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Function API:
//       UINT16 crc_t10dif_pcl(
//               UINT16 init_crc, // initial CRC value, 16 bits
//               const unsigned char *buf, // buffer pointer to calculate CRC on
//               UINT64 len // buffer length in bytes (64-bit data)
//       );
//
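//       In this arm64 file the same interface is provided by the two entry
//       points defined below, crc_t10dif_pmull_p64 (64x64-bit PMULL from the
//       Crypto Extensions) and crc_t10dif_pmull_p8 (baseline 8x8-bit PMULL
//       only); a C-level usage sketch follows the ENTRY/ENDPROC pairs at the
//       end of the code.
//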
//       Reference paper titled "Fast CRC Computation for Generic
//       Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//       /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13

	ad		.req	v14
	bd		.req	v10

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

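// The p8 fallback below builds a 64x64 -> 128 bit carry-less multiply out of
// the baseline 8x8 -> 16 bit pmull. A rough editorial sketch of the idea, in
// C-like pseudocode (An/Bn denote A/B with each 64-bit half rotated by n
// bytes, as prepared by the perm1..perm4 tbl masks above; '+' is xor):
//
//	D = pmull(A, B)				// byte-aligned partial product
//	L = pmull(A1, B) + pmull(A, B1)		// contribution shifted << 8
//	M = pmull(A2, B) + pmull(A, B2)		// ... << 16
//	N = pmull(A3, B) + pmull(A, B3)		// ... << 24
//	K = pmull(A, B4)			// ... << 32
//	A*B = D + (L << 8) + (M << 16) + (N << 24) + (K << 32)
//
// with the bytes of L/M/N/K that would wrap around a 64-bit half masked off
// (k00_16, k32_48) before the shifted xors; the uzp/and/zip/ext sequence in
// __pmull_p8_core implements that masking and shifting.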
__pmull_p8_core:
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, bd.8b			// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, bd.8b			// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, bd.8b			// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, bd.16b			// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, bd.16b			// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, bd.16b			// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
ENDPROC(__pmull_p8_core)

	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, v10
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, bd.8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, bd.16b		// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	.macro		fold64, p, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
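
	// What one fold step computes, in C-like pseudocode (an editorial
	// sketch, not part of the original comments; clmul() stands for a
	// 64x64 carry-less multiply, K_hi:K_lo for the pair of folding
	// constants currently loaded in v10):
	//
	//	reg = clmul(reg_hi, K_hi) ^ clmul(reg_lo, K_lo) ^ new_data
	//
	// i.e. each 64-bit half of the 128-bit accumulator is multiplied by
	// its folding constant and the results are xor'ed with the next block
	// of byte-swapped input.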

	.macro		fold16, p, reg, rk
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	__pmull_pre_\p	v10
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	__pmull_init_\p

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256 bytes, we cannot fold 128 bytes at a time
	b.lt		.L_less_than_128_\@

	// load the initial crc value
	// crc value does not need to be byte-reflected, but it needs
	// to be moved to the high part of the register.
	// because data will be byte-reflected and will align with
	// initial crc at correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128 bytes of data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80

CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4
					// type of pmull instruction
					// will determine which constant to use
	__pmull_pre_\p	v10

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this point there are 128*x+y (0 <= y < 128) bytes of buffer.
	// The _fold_64_B_loop below folds 128 bytes per iteration
	// until we have 128+y bytes of buffer left

	// fold 128 bytes at a time. This section of the code folds 8 vector
	// registers in parallel
.L_fold_64_B_loop_\@:

	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there are at least another 128 bytes in the buffer to fold
	b.lt		.L_fold_64_B_end_\@

	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	__pmull_init_\p
	__pmull_pre_\p	v10
	endif_yield_neon

	b		.L_fold_64_B_loop_\@

.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer; the 128 bytes of folded data are in 8 of the vector
	// registers: v0-v7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	ldr_l		q10, rk9, x8
	__pmull_pre_\p	v10

	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 128, we add 112 (128-16) to the loop counter to save one
	// instruction from the loop; instead of a cmp instruction, we use the
	// negative flag with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		.L_final_reduction_for_128_\@

	// now we have 16+y bytes left to reduce. 16 bytes are in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
	// continue folding 16 bytes at a time

.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// b.ge instruction, equivalent of: cmp arg3, 16-16
	// check if there are any more 16 bytes in the buffer to be able to fold
	b.ge		.L_16B_reduction_loop_\@

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 Bytes
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b

.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10
	__pmull_pre_\p	v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
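	// The barrett step follows the reduction described in the reference
	// paper cited in the header (an editorial sketch of the math, using
	// '+' for xor as in the comments above; R is the remaining 64-bit
	// remainder, Q the scaled polynomial):
	//
	//	T1  = clmul( floor(R / x^32), floor(x^64 / Q) )	// uses rk7
	//	T2  = clmul( floor(T1 / x^32), Q )		// uses rk8
	//	crc = (R + T2) mod x^32
	//
	// the 32-bit (still x^16-scaled) result is read out of v7.s[1] below
	// and the extra 16-bit scaling is removed at .L_cleanup (lsr #16).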
	ldr_l		q10, rk7, x8
	__pmull_pre_\p	v10
	mov		v0.d[0], v7.d[1]

	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]

.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		.L_128_done_\@		// exactly 16 left
	b.lt		.L_less_than_16_left_\@

	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10
	__pmull_pre_\p	v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		.L_16B_reduction_loop_\@

	add		arg3, arg3, #16
	b		.L_get_last_two_regs_\@

.L_less_than_16_left_\@:
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		.L_128_done_\@
	.endm

ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)
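
// How a C caller might declare and choose between the two entry points above
// (an editorial sketch; the declarations mirror the Function API documented in
// the header, while the feature test shown is an assumption and not part of
// this file: the p64 variant needs the 64x64-bit PMULL form provided by the
// Crypto Extensions, the p8 variant only the baseline 8x8-bit PMULL):
//
//	u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, u64 len);
//	u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, u64 len);
//
//	crc = have_64x64_pmull ? crc_t10dif_pmull_p64(crc, buf, len)
//			       : crc_t10dif_pmull_p8(crc, buf, len);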

// precomputed constants
// these constants are precomputed from the poly:
// 0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

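// A small C helper (an editorial sketch, not part of the build) showing how
// the rk values above could be regenerated from the definitions just given;
// xpow_mod_q() is a hypothetical name:
//
//	#include <stdint.h>
//
//	static uint64_t xpow_mod_q(unsigned int n)	/* x^n mod Q, GF(2) */
//	{
//		uint64_t q = 0x18bb70000ULL;	/* 0x8bb7 scaled, plus x^32 */
//		uint64_t r = 1;			/* x^0 */
//
//		while (n--) {
//			r <<= 1;		/* multiply by x */
//			if (r & (1ULL << 32))	/* degree reached 32: reduce */
//				r ^= q;
//		}
//		return r;
//	}
//
//	// e.g. rk1 = xpow_mod_q(32 * 3) << 32, rk2 = xpow_mod_q(32 * 5) << 32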
rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
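//
// Usage sketch (editorial note, matching the adr_l/sub/ld1 sequences above):
// loading 16 bytes from tbl_shf_table + 16 - n (1 <= n <= 15) yields a tbl
// mask whose first n entries have the top bit set (tbl returns zero for those
// lanes) and whose remaining entries are 0x0, 0x1, ... For n = 3 the mask is
// 8d 8e 8f 00 01 ... 0c, so
//
//	tbl	v2.16b, {v7.16b}, v0.16b	// v7 shifted left by 3 bytes
//
// while eor'ing the mask with 0x80 first makes the same tbl shift v7 right by
// 16 - n = 13 bytes, which is how the partial final block is aligned.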

	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0