//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//	Erdinc Ozturk <erdinc.ozturk@intel.com>
//	Vinodh Gopal <vinodh.gopal@intel.com>
//	James Guilford <james.guilford@intel.com>
//	Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//	Function API (same signature for both entry points defined at the
//	end of this file):
//
//	UINT16 crc_t10dif_pmull_p64(		// or crc_t10dif_pmull_p8
//		UINT16 init_crc,		// initial CRC value, 16 bits
//		const unsigned char *buf,	// buffer pointer to calculate CRC on
//		UINT64 len			// buffer length in bytes (64-bit data)
//	);
//
//	Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//	URL: http://www.intel.com/content/dam/www/public/us/en/documents
//	/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13

	ad		.req	v14
	bd		.req	v10

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

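//
// __pmull_p8_core (used by the p8 fallback when the 64-bit PMULL instruction
// is not available) emulates a 64x64 -> 128 bit carryless multiply using only
// the 8-bit polynomial multiply.  Conceptually, for operands A and B with B
// split into bytes b7..b0:
//
//	A x B = xor(i = 0..7) ( (A x bi) << 8*i )
//
// The code computes this by multiplying byte-rotated copies of A (A1..A3) by
// B, and A by the tbl-permuted copies of B (B1..B4, set up by __pmull_pre_p8
// above), then masking with k00_16/k32_48 and realigning the partial products
// (the ext #15..#12 below are byte shifts) before the __pmull_p8 macro XORs
// them into the final product.  .L__pmull_p8_core operates on the low 64-bit
// halves, .L__pmull_p8_core2 on the high halves.
//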
__pmull_p8_core:
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1		// A1
	ext		t5.8b, ad.8b, ad.8b, #2		// A2
	ext		t6.8b, ad.8b, ad.8b, #3		// A3

	pmull		t4.8h, t4.8b, bd.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b		// E = A*B1
	pmull		t5.8h, t5.8b, bd.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b		// G = A*B2
	pmull		t6.8h, t6.8b, bd.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b		// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b		// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b	// A1
	tbl		t5.16b, {ad.16b}, perm2.16b	// A2
	tbl		t6.16b, {ad.16b}, perm3.16b	// A3

	pmull2		t4.8h, t4.16b, bd.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b		// E = A*B1
	pmull2		t5.8h, t5.16b, bd.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b		// G = A*B2
	pmull2		t6.8h, t6.16b, bd.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b		// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b		// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b		// L = E + F
	eor		t5.16b, t5.16b, t7.16b		// M = G + H
	eor		t6.16b, t6.16b, t9.16b		// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
ENDPROC(__pmull_p8_core)

	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, v10
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, bd.8b	// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, bd.16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

	.macro		fold64, p, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	.macro		fold16, p, reg, rk
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	__pmull_pre_\p	v10
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	__pmull_init_\p

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256, we can't fold 128 bytes at a time...
	b.lt		.L_less_than_128_\@

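//
// Large-buffer path (>= 256 bytes): a 128-byte running remainder is kept in
// v0-v7, with the initial CRC XORed into the first vector, and 128 bytes of
// new data are folded into it per loop iteration.  For each 128-bit
// accumulator, every fold64 invocation in the loop below effectively computes
//
//	acc' = (acc.d[0] x rk3) xor (acc.d[1] x rk4) xor next_16_data_bytes
//
// where 'x' is a 64x64 carryless multiply and rk3/rk4 sit in the low and
// high 64-bit lanes of v10 (see the constant table at the end of this file).
//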
	// load the initial crc value
	// the crc value does not need to be byte-reflected, but it does need
	// to be moved to the high part of the register, because the data will
	// be byte-reflected and will then line up with the initial crc in the
	// correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128 bytes of data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80

CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4
					// type of pmull instruction
					// will determine which constant to use
	__pmull_pre_\p	v10

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this section of the code, there are 128*x+y (0 <= y < 128) bytes
	// of buffer. The _fold_64_B_loop will fold 128 bytes at a time
	// until we have 128+y bytes of buffer

	// fold 128 bytes at a time. This section of the code folds 8 vector
	// registers in parallel
.L_fold_64_B_loop_\@:

	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there are another 128 bytes in the buffer to be able to fold
	b.lt		.L_fold_64_B_end_\@

	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	__pmull_init_\p
	__pmull_pre_\p	v10
	endif_yield_neon

	b		.L_fold_64_B_loop_\@

.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are in the 8 vector
	// registers: v0-v7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	ldr_l		q10, rk9, x8
	__pmull_pre_\p	v10

	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 128, we add 112 (128-16) to the loop counter to save one
	// instruction from the loop. Instead of a cmp instruction, we use the
	// negative flag with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		.L_final_reduction_for_128_\@

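//
// Remainder phase: from here on, v7 holds the running 128-bit remainder and
// v10 holds rk1/rk2 (loaded by the last fold16 above that takes an \rk
// argument).  Any remaining whole 16-byte blocks are folded in, a sub-16-byte
// tail (if any) is spliced in via tbl_shf_table, and the result is reduced
// 128 -> 64 -> 32 bits with rk5/rk6 before a final Barrett reduction with
// rk7 = floor(2^64/Q) and rk8 = Q yields the CRC.
//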
	// now we have 16+y bytes left to reduce. 16 bytes are in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
	// continue folding 16 bytes at a time

.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// b.ge instruction: equivalent of cmp arg3, 16-16
	// check if there are any more 16-byte blocks in the buffer to fold
	b.ge		.L_16B_reduction_loop_\@

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can move
	// the input pointer back so that we load exactly 16 bytes ending at
	// the end of the buffer. after that the registers need to be adjusted.
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 bytes
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b

.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10
	__pmull_pre_\p	v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
	ldr_l		q10, rk7, x8
	__pmull_pre_\p	v10
	mov		v0.d[0], v7.d[1]

	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]

.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

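//
// Short-input path (1 to 255 bytes): load 16 bytes from the start of the
// buffer and XOR in the initial CRC.  Exactly 16 bytes goes straight to the
// final reduction; longer inputs reuse the 16-byte folding loop and tail
// handling above with rk1/rk2 in v10; inputs shorter than 16 bytes are
// shifted into place with tbl and tbl_shf_table before the final reduction.
//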
.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		.L_128_done_\@		// exactly 16 left
	b.lt		.L_less_than_16_left_\@

	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10
	__pmull_pre_\p	v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		.L_16B_reduction_loop_\@

	add		arg3, arg3, #16
	b		.L_get_last_two_regs_\@

.L_less_than_16_left_\@:
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		.L_128_done_\@
	.endm

ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
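// Note on the table above: loading 16 bytes from tbl_shf_table + 16 - n
// (0 < n < 16) yields a tbl index vector that shifts a 16-byte register left
// by n bytes, since lanes whose index has the top bit set select zero.
// XORing that vector with 0x80 flips which lanes are live, giving the
// complementary shift right by 16 - n.  This is how .L_get_last_two_regs and
// .L_less_than_16_left splice the final partial block into v7.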