########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#     Erdinc Ozturk <erdinc.ozturk@intel.com>
#     Vinodh Gopal <vinodh.gopal@intel.com>
#     James Guilford <james.guilford@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
#       Function API:
#       UINT16 crc_t10dif_pcl(
#               UINT16 init_crc, //initial CRC value, 16 bits
#               const unsigned char *buf, //buffer pointer to calculate CRC on
#               UINT64 len //buffer length in bytes (64-bit data)
#       );
#
#       Reference paper titled "Fast CRC Computation for Generic
#	Polynomials Using PCLMULQDQ Instruction"
#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#
#

#include <linux/linkage.h>

.text

# Names for the three integer argument registers, in the order of the
# crc_t10dif_pcl(init_crc, buf, len) prototype:
#	arg1 = init_crc, arg2 = buf, arg3 = len
#define        arg1 %rdi
#define        arg2 %rsi
#define        arg3 %rdx

# 32-bit view of arg1; the 16-bit initial CRC is handled through it
#define        arg1_low32 %edi

########################################################################
# crc_t10dif_pcl - CRC-T10DIF of a byte buffer using PCLMULQDQ folding
#
# Inputs:
#	arg1 (%rdi)	initial CRC value, 16 bits (accessed via arg1_low32)
#	arg2 (%rsi)	pointer to the buffer
#	arg3 (%rdx)	buffer length in bytes
# Output:
#	%eax		resulting 16-bit CRC
#
# Register roles:
#	%rcx		caller %rsp, saved on entry and restored in _cleanup;
#			it must not be overwritten anywhere in between
#	%xmm11		SHUF_MASK, used to byte-reflect every 16B of input
#	%xmm10		pair of folding constants currently in use
#	%xmm0-%xmm7	folding accumulators; the running 128-bit state
#			ends up in %xmm7
#	%xmm8, %xmm9, %xmm12, %xmm13, %rax, %r9, %r11	scratch
#	arg1, arg2 and arg3 are modified.
#
# Stack: 32 bytes are reserved and %rsp is aligned down to 16 bytes; the
# 16 bytes at (%rsp) are used as a zero-padded staging buffer when the
# input is shorter than 16 bytes.
#
# Paths taken, by length:
#	len >= 256	128-byte folding loop, then 16-byte folding
#	32 <= len < 256	16-byte folding loop only (_less_than_128)
#	16 <  len < 32	one 16-byte load plus the tail (_get_last_two_xmms)
#	len == 16	_exact_16_left
#	4 <= len < 16	staged through the stack (_less_than_16_left)
#	1 <= len < 4	staged through the stack, straight to _barrett
#	len == 0	the initial CRC is returned unchanged
#
# NOTE(review): the length tests use signed branches (jl/jge) although
# len is declared unsigned; a length of 2^63 or more would be treated as
# negative. Left as is - confirm callers never pass such values.
########################################################################
ENTRY(crc_t10dif_pcl)
.align 16

	# adjust the 16-bit initial_crc value, scale it to 32 bits
	shl	$16, arg1_low32

	# Allocate Stack Space (original %rsp is kept in %rcx)
	mov	%rsp, %rcx
	sub	$16*2, %rsp
	# align stack to 16 byte boundary
	and	$~(0x10 - 1), %rsp

	# check whether the buffer is smaller than 256 bytes
	cmp	$256, arg3

	# for sizes less than 256, we cannot fold 128B at a time...
	# (the label name says 128, but the threshold tested above is 256)
	jl	_less_than_128


	# load the initial crc value
	movd	arg1_low32, %xmm10	# initial crc

	# crc value does not need to be byte-reflected, but it needs
	# to be moved to the high part of the register.
	# because data will be byte-reflected and will align with
	# initial crc at correct place.
	pslldq	$12, %xmm10

	movdqa	SHUF_MASK(%rip), %xmm11
	# receive the initial 128B data, xor the initial crc value
	movdqu	16*0(arg2), %xmm0
	movdqu	16*1(arg2), %xmm1
	movdqu	16*2(arg2), %xmm2
	movdqu	16*3(arg2), %xmm3
	movdqu	16*4(arg2), %xmm4
	movdqu	16*5(arg2), %xmm5
	movdqu	16*6(arg2), %xmm6
	movdqu	16*7(arg2), %xmm7

	pshufb	%xmm11, %xmm0
	# XOR the initial_crc value
	pxor	%xmm10, %xmm0
	pshufb	%xmm11, %xmm1
	pshufb	%xmm11, %xmm2
	pshufb	%xmm11, %xmm3
	pshufb	%xmm11, %xmm4
	pshufb	%xmm11, %xmm5
	pshufb	%xmm11, %xmm6
	pshufb	%xmm11, %xmm7

	movdqa	rk3(%rip), %xmm10	#xmm10 has rk3 and rk4
					#imm value of pclmulqdq instruction
					#will determine which constant to use

	#################################################################
	# we subtract 256 instead of 128 to save one instruction from the loop
	sub	$256, arg3

	# at this section of the code, there is 128*x+y (0<=y<128) bytes of
	# buffer. The _fold_64_B_loop will fold 128B at a time
	# until we have 128+y Bytes of buffer


	# fold 128B at a time. This section of the code folds 8 xmm
	# registers in parallel (the label name still says 64)
_fold_64_B_loop:

	# update the buffer pointer
	add	$128, arg2		#    buf += 128

	movdqu	16*0(arg2), %xmm9
	movdqu	16*1(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm13
	pclmulqdq	$0x0 , %xmm10, %xmm0
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0 , %xmm10, %xmm1
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9 , %xmm0
	xorps	%xmm8 , %xmm0
	pxor	%xmm12, %xmm1
	xorps	%xmm13, %xmm1

	movdqu	16*2(arg2), %xmm9
	movdqu	16*3(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm2, %xmm8
	movdqa	%xmm3, %xmm13
	pclmulqdq	$0x0, %xmm10, %xmm2
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0, %xmm10, %xmm3
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9 , %xmm2
	xorps	%xmm8 , %xmm2
	pxor	%xmm12, %xmm3
	xorps	%xmm13, %xmm3

	movdqu	16*4(arg2), %xmm9
	movdqu	16*5(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm4, %xmm8
	movdqa	%xmm5, %xmm13
	pclmulqdq	$0x0,  %xmm10, %xmm4
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0,  %xmm10, %xmm5
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9 ,  %xmm4
	xorps	%xmm8 ,  %xmm4
	pxor	%xmm12,  %xmm5
	xorps	%xmm13,  %xmm5

	movdqu	16*6(arg2), %xmm9
	movdqu	16*7(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm6 , %xmm8
	movdqa	%xmm7 , %xmm13
	pclmulqdq	$0x0 , %xmm10, %xmm6
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0 , %xmm10, %xmm7
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9 , %xmm6
	xorps	%xmm8 , %xmm6
	pxor	%xmm12, %xmm7
	xorps	%xmm13, %xmm7

	sub	$128, arg3

	# check whether there is another 128B in the buffer to be able to fold
	jge	_fold_64_B_loop
	##################################################################


	add	$128, arg2
	# at this point, the buffer pointer is pointing at the last y Bytes
	# of the buffer; the 128B of folded data is in 8 of the xmm
	# registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7


	# fold the 8 xmm registers to 1 xmm register with different constants
	# (xmm0..xmm6 are each multiplied by their own constant pair and
	# accumulated into xmm7)

	movdqa	rk9(%rip), %xmm10
	movdqa	%xmm0, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm0
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm0, %xmm7

	movdqa	rk11(%rip), %xmm10
	movdqa	%xmm1, %xmm8
	pclmulqdq	 $0x11, %xmm10, %xmm1
	pclmulqdq	 $0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm1, %xmm7

	movdqa	rk13(%rip), %xmm10
	movdqa	%xmm2, %xmm8
	pclmulqdq	 $0x11, %xmm10, %xmm2
	pclmulqdq	 $0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm2, %xmm7

	movdqa	rk15(%rip), %xmm10
	movdqa	%xmm3, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm3
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm3, %xmm7

	movdqa	rk17(%rip), %xmm10
	movdqa	%xmm4, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm4
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm4, %xmm7

	movdqa	rk19(%rip), %xmm10
	movdqa	%xmm5, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm5
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm5, %xmm7

	movdqa	rk1(%rip), %xmm10	#xmm10 has rk1 and rk2
					#imm value of pclmulqdq instruction
					#will determine which constant to use
	movdqa	%xmm6, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm6
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm6, %xmm7


	# instead of 128, we add 112 (128-16) to the loop counter to save 1
	# instruction from the loop; instead of a cmp instruction, we use
	# the negative flag with the jl instruction
	add	$128-16, arg3
	jl	_final_reduction_for_128

	# now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
	# and the rest is in memory. We can fold 16 bytes at a time while y>=16
	# continue folding 16B at a time
	# (xmm10 must hold rk1/rk2 and xmm11 SHUF_MASK when entering here;
	# _less_than_128 jumps in after setting both up)

_16B_reduction_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm7
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(arg2), %xmm0
	pshufb	%xmm11, %xmm0
	pxor	%xmm0 , %xmm7
	add	$16, arg2
	sub	$16, arg3
	# instead of a cmp instruction, we utilize the flags with the
	# jge instruction equivalent of: cmp arg3, 16-16
	# check whether there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	#now we have 16+z bytes left to reduce, where 0<= z < 16.
	#first, we reduce the data in the xmm7 register


_final_reduction_for_128:
	# check whether there is any more data to fold. When there is none,
	# compute the CRC of the final 128 bits
	# (after the add, arg3 holds the number of tail bytes, 0..15)
	add	$16, arg3
	je	_128_done

	# here we are getting data that is less than 16 bytes.
	# since we know that there was data before the pointer, we can
	# offset the input pointer before the actual point, to receive
	# exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_xmms:
	movdqa	%xmm7, %xmm2

	# load the last 16 bytes of the buffer (overlaps already folded data)
	movdqu	-16(arg2, arg3), %xmm1
	pshufb	%xmm11, %xmm1

	# get rid of the extra data that was loaded before
	# load the shift constant
	lea	pshufb_shf_table+16(%rip), %rax
	sub	arg3, %rax
	movdqu	(%rax), %xmm0

	# shift xmm2 to the left by arg3 bytes
	pshufb	%xmm0, %xmm2

	# shift xmm7 to the right by 16-arg3 bytes
	pxor	mask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7
	pblendvb	%xmm2, %xmm1	#xmm0 is implicit

	# fold 16 Bytes
	movdqa	%xmm1, %xmm2
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm7
	pclmulqdq	$0x0 , %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm2, %xmm7

_128_done:
	# compute crc of a 128-bit value
	movdqa	rk5(%rip), %xmm10	# rk5 and rk6 in xmm10
	movdqa	%xmm7, %xmm0

	#64b fold
	pclmulqdq	$0x1, %xmm10, %xmm7
	pslldq	$8   ,  %xmm0
	pxor	%xmm0,  %xmm7

	#32b fold
	movdqa	%xmm7, %xmm0

	pand	mask2(%rip), %xmm0

	psrldq	$12, %xmm7
	pclmulqdq	$0x10, %xmm10, %xmm7
	pxor	%xmm0, %xmm7

	#barrett reduction
_barrett:
	movdqa	rk7(%rip), %xmm10	# rk7 and rk8 in xmm10
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x01, %xmm10, %xmm7
	pslldq	$4, %xmm7
	pclmulqdq	$0x11, %xmm10, %xmm7

	pslldq	$4, %xmm7
	pxor	%xmm0, %xmm7
	pextrd	$1, %xmm7, %eax

_cleanup:
	# scale the result back to 16 bits
	shr	$16, %eax
	# restore the caller stack pointer saved on entry
	mov	%rcx, %rsp
	ret

########################################################################

.align 16
_less_than_128:
	# reached for every length below 256 (see the cmp $256 at entry)

	# check whether there is enough buffer to be able to fold 16B at a time
	cmp	$32, arg3
	jl	_less_than_32
	movdqa	SHUF_MASK(%rip), %xmm11

	# now that there is, load the constants
	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10

	movd	arg1_low32, %xmm0	# get the initial crc value
	pslldq	$12, %xmm0	# align it to its correct place
	movdqu	(arg2), %xmm7	# load the plaintext
	pshufb	%xmm11, %xmm7	# byte-reflect the plaintext
	pxor	%xmm0, %xmm7


	# update the buffer pointer
	add	$16, arg2

	# update the counter. subtract 32 instead of 16 to save one
	# instruction from the loop
	sub	$32, arg3

	jmp	_16B_reduction_loop


.align 16
_less_than_32:
	# mov initial crc to the return value. this is necessary for
	# zero-length buffers.
	mov	arg1_low32, %eax
	test	arg3, arg3
	je	_cleanup

	movdqa	SHUF_MASK(%rip), %xmm11

	movd	arg1_low32, %xmm0	# get the initial crc value
	pslldq	$12, %xmm0	# align it to its correct place

	cmp	$16, arg3
	je	_exact_16_left
	jl	_less_than_16_left

	# 16 < len < 32: take the first 16 bytes, the tail is handled by
	# _get_last_two_xmms with arg3 = number of remaining bytes
	movdqu	(arg2), %xmm7	# load the plaintext
	pshufb	%xmm11, %xmm7	# byte-reflect the plaintext
	pxor	%xmm0 , %xmm7	# xor the initial crc value
	add	$16, arg2
	sub	$16, arg3
	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms


.align 16
_less_than_16_left:
	# use stack space to load data less than 16 bytes, zero-out
	# the 16B in memory first.
	# (%r11 is the write cursor into that 16-byte stack buffer)

	pxor	%xmm1, %xmm1
	mov	%rsp, %r11
	movdqa	%xmm1, (%r11)

	cmp	$4, arg3
	jl	_only_less_than_4

	# backup the counter value
	mov	arg3, %r9
	cmp	$8, arg3
	jl	_less_than_8_left

	# load 8 Bytes
	mov	(arg2), %rax
	mov	%rax, (%r11)
	add	$8, %r11
	sub	$8, arg3
	add	$8, arg2
_less_than_8_left:

	cmp	$4, arg3
	jl	_less_than_4_left

	# load 4 Bytes
	mov	(arg2), %eax
	mov	%eax, (%r11)
	add	$4, %r11
	sub	$4, arg3
	add	$4, arg2
_less_than_4_left:

	cmp	$2, arg3
	jl	_less_than_2_left

	# load 2 Bytes
	mov	(arg2), %ax
	mov	%ax, (%r11)
	add	$2, %r11
	sub	$2, arg3
	add	$2, arg2
_less_than_2_left:
	cmp	$1, arg3
	jl	_zero_left

	# load 1 Byte
	mov	(arg2), %al
	mov	%al, (%r11)
_zero_left:
	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0 , %xmm7	# xor the initial crc value

	# shift xmm7 to the right by 16-r9 bytes, r9 being the byte count
	# saved above (same table lookup as in _get_last_two_xmms)
	lea	pshufb_shf_table+16(%rip), %rax
	sub	%r9, %rax
	movdqu	(%rax), %xmm0
	pxor	mask1(%rip), %xmm0

	pshufb	%xmm0, %xmm7
	jmp	_128_done

.align 16
_exact_16_left:
	movdqu	(arg2), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0 , %xmm7   # xor the initial crc value

	jmp	_128_done

_only_less_than_4:
	# 1..3 bytes: copy them one by one into the zeroed stack buffer,
	# shift the value right by a fixed amount and go straight to _barrett
	cmp	$3, arg3
	jl	_only_less_than_3

	# load 3 Bytes
	mov	(arg2), %al
	mov	%al, (%r11)

	mov	1(arg2), %al
	mov	%al, 1(%r11)

	mov	2(arg2), %al
	mov	%al, 2(%r11)

	movdqa	 (%rsp), %xmm7
	pshufb	 %xmm11, %xmm7
	pxor	 %xmm0 , %xmm7  # xor the initial crc value

	psrldq	$5, %xmm7

	jmp	_barrett
_only_less_than_3:
	cmp	$2, arg3
	jl	_only_less_than_2

	# load 2 Bytes
	mov	(arg2), %al
	mov	%al, (%r11)

	mov	1(arg2), %al
	mov	%al, 1(%r11)

	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0 , %xmm7   # xor the initial crc value

	psrldq	$6, %xmm7

	jmp	_barrett
_only_less_than_2:

	# load 1 Byte
	mov	(arg2), %al
	mov	%al, (%r11)

	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0 , %xmm7   # xor the initial crc value

	psrldq	$7, %xmm7

	jmp	_barrett

ENDPROC(crc_t10dif_pcl)

.section	.rodata, "a", @progbits
.align 16
# precomputed constants
# these constants are precomputed from the poly:
# 0x8bb70000 (0x8bb7 scaled to 32 bits)
# Q = 0x18BB70000
# rk1 = 2^(32*3) mod Q << 32
# rk2 = 2^(32*5) mod Q << 32
# rk3 = 2^(32*15) mod Q << 32
# rk4 = 2^(32*17) mod Q << 32
# rk5 = 2^(32*3) mod Q << 32
# rk6 = 2^(32*2) mod Q << 32
# rk7 = floor(2^64/Q)
# rk8 = Q
#
# NOTE(review): rk3/rk4 are the constants used by the main loop of
# crc_t10dif_pcl, which advances 128 bytes per iteration; the exponents
# 15 and 17 listed above look like they describe a 64-byte distance.
# The values below are untouched - verify the formulas for rk3/rk4.
#
# The code loads these constants in pairs with a single 16-byte movdqa
# (rk1+rk2, rk3+rk4, rk5+rk6, rk7+rk8, rk9+rk10, ... rk19+rk20), so the
# order of the entries and the 16-byte alignment of each odd-numbered
# label must be preserved.
rk1:
.quad 0x2d56000000000000
rk2:
.quad 0x06df000000000000
rk3:
.quad 0x9d9d000000000000
rk4:
.quad 0x7cf5000000000000
rk5:
.quad 0x2d56000000000000
rk6:
.quad 0x1368000000000000
rk7:
.quad 0x00000001f65a57f8
rk8:
.quad 0x000000018bb70000

# rk9..rk20: one pair per accumulator register (xmm0..xmm5) for the
# step that folds the eight accumulators into xmm7; xmm6 uses rk1/rk2.
rk9:
.quad 0xceae000000000000
rk10:
.quad 0xbfd6000000000000
rk11:
.quad 0x1e16000000000000
rk12:
.quad 0x713c000000000000
rk13:
.quad 0xf7f9000000000000
rk14:
.quad 0x80a6000000000000
rk15:
.quad 0x044c000000000000
rk16:
.quad 0xe658000000000000
rk17:
.quad 0xad18000000000000
rk18:
.quad 0xa497000000000000
rk19:
.quad 0x6ee3000000000000
rk20:
.quad 0xe7b5000000000000



.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align 16
# 0x80 in every byte. XORed into a pshufb control vector taken from
# pshufb_shf_table, it flips bit 7 of every selector byte, turning the
# "shift left" control into the matching "shift right" control.
mask1:
.octa 0x80808080808080808080808080808080

.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align 16
# keeps the low 96 bits of a 128-bit value and clears the top 32 bits;
# applied with pand during the "32b fold" step
mask2:
.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
# pshufb control vector that reverses the order of the 16 bytes of a
# register; used to byte-reflect every block of input data
SHUF_MASK:
.octa 0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
.align 32
pshufb_shf_table:
# use these values for shift constants for the pshufb instruction
# the code reads 16 bytes at pshufb_shf_table + 16 - n (unaligned), so the
# two .octa entries below must stay adjacent and in this order.
# different alignments result in values as shown:
#	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
#	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
#	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
#	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
#	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
#	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
#	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
#	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
#	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
#	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
#	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
#	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
#	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
#	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
#	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
.octa 0x8f8e8d8c8b8a89888786858483828100
.octa 0x000e0d0c0b0a09080706050403020100
652