1/* 2 * Calculate the checksum of data that is 16 byte aligned and a multiple of 3 * 16 bytes. 4 * 5 * The first step is to reduce it to 1024 bits. We do this in 8 parallel 6 * chunks in order to mask the latency of the vpmsum instructions. If we 7 * have more than 32 kB of data to checksum we repeat this step multiple 8 * times, passing in the previous 1024 bits. 9 * 10 * The next step is to reduce the 1024 bits to 64 bits. This step adds 11 * 32 bits of 0s to the end - this matches what a CRC does. We just 12 * calculate constants that land the data in this 32 bits. 13 * 14 * We then use fixed point Barrett reduction to compute a mod n over GF(2) 15 * for n = CRC using POWER8 instructions. We use x = 32. 16 * 17 * http://en.wikipedia.org/wiki/Barrett_reduction 18 * 19 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM 20 * 21 * This program is free software; you can redistribute it and/or 22 * modify it under the terms of the GNU General Public License 23 * as published by the Free Software Foundation; either version 24 * 2 of the License, or (at your option) any later version. 
25 */ 26#include <asm/ppc_asm.h> 27#include <asm/ppc-opcode.h> 28 29 .section .rodata 30.balign 16 31 32.byteswap_constant: 33 /* byte reverse permute constant */ 34 .octa 0x0F0E0D0C0B0A09080706050403020100 35 36#define MAX_SIZE 32768 37.constants: 38 39 /* Reduce 262144 bits to 1024 bits */ 40 /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ 41 .octa 0x00000000b6ca9e20000000009c37c408 42 43 /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ 44 .octa 0x00000000350249a800000001b51df26c 45 46 /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ 47 .octa 0x00000001862dac54000000000724b9d0 48 49 /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ 50 .octa 0x00000001d87fb48c00000001c00532fe 51 52 /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ 53 .octa 0x00000001f39b699e00000000f05a9362 54 55 /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ 56 .octa 0x0000000101da11b400000001e1007970 57 58 /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ 59 .octa 0x00000001cab571e000000000a57366ee 60 61 /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ 62 .octa 0x00000000c7020cfe0000000192011284 63 64 /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ 65 .octa 0x00000000cdaed1ae0000000162716d9a 66 67 /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ 68 .octa 0x00000001e804effc00000000cd97ecde 69 70 /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ 71 .octa 0x0000000077c3ea3a0000000058812bc0 72 73 /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ 74 .octa 0x0000000068df31b40000000088b8c12e 75 76 /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ 77 .octa 0x00000000b059b6c200000001230b234c 78 79 /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ 80 .octa 0x0000000145fb8ed800000001120b416e 81 82 /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ 83 .octa 0x00000000cbc0916800000001974aecb0 84 85 /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ 86 .octa 0x000000005ceeedc2000000008ee3f226 87 88 /* x^244736 mod p(x)` 
<< 1, x^244800 mod p(x)` << 1 */ 89 .octa 0x0000000047d74e8600000001089aba9a 90 91 /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ 92 .octa 0x00000001407e9e220000000065113872 93 94 /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ 95 .octa 0x00000001da967bda000000005c07ec10 96 97 /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ 98 .octa 0x000000006c8983680000000187590924 99 100 /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ 101 .octa 0x00000000f2d14c9800000000e35da7c6 102 103 /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ 104 .octa 0x00000001993c6ad4000000000415855a 105 106 /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ 107 .octa 0x000000014683d1ac0000000073617758 108 109 /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ 110 .octa 0x00000001a7c93e6c0000000176021d28 111 112 /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ 113 .octa 0x000000010211e90a00000001c358fd0a 114 115 /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ 116 .octa 0x000000001119403e00000001ff7a2c18 117 118 /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ 119 .octa 0x000000001c3261aa00000000f2d9f7e4 120 121 /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ 122 .octa 0x000000014e37a634000000016cf1f9c8 123 124 /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ 125 .octa 0x0000000073786c0c000000010af9279a 126 127 /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ 128 .octa 0x000000011dc037f80000000004f101e8 129 130 /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ 131 .octa 0x0000000031433dfc0000000070bcf184 132 133 /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ 134 .octa 0x000000009cde8348000000000a8de642 135 136 /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ 137 .octa 0x0000000038d3c2a60000000062ea130c 138 139 /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ 140 .octa 0x000000011b25f26000000001eb31cbb2 141 142 /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ 143 .octa 
0x000000001629e6f00000000170783448 144 145 /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ 146 .octa 0x0000000160838b4c00000001a684b4c6 147 148 /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ 149 .octa 0x000000007a44011c00000000253ca5b4 150 151 /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ 152 .octa 0x00000000226f417a0000000057b4b1e2 153 154 /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ 155 .octa 0x0000000045eb2eb400000000b6bd084c 156 157 /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ 158 .octa 0x000000014459d70c0000000123c2d592 159 160 /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ 161 .octa 0x00000001d406ed8200000000159dafce 162 163 /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ 164 .octa 0x0000000160c8e1a80000000127e1a64e 165 166 /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ 167 .octa 0x0000000027ba80980000000056860754 168 169 /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ 170 .octa 0x000000006d92d01800000001e661aae8 171 172 /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ 173 .octa 0x000000012ed7e3f200000000f82c6166 174 175 /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ 176 .octa 0x000000002dc8778800000000c4f9c7ae 177 178 /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ 179 .octa 0x0000000018240bb80000000074203d20 180 181 /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ 182 .octa 0x000000001ad381580000000198173052 183 184 /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ 185 .octa 0x00000001396b78f200000001ce8aba54 186 187 /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ 188 .octa 0x000000011a68133400000001850d5d94 189 190 /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ 191 .octa 0x000000012104732e00000001d609239c 192 193 /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ 194 .octa 0x00000000a140d90c000000001595f048 195 196 /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ 197 .octa 0x00000001b7215eda0000000042ccee08 198 199 /* x^206848 
mod p(x)` << 1, x^206912 mod p(x)` << 1 */ 200 .octa 0x00000001aaf1df3c000000010a389d74 201 202 /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ 203 .octa 0x0000000029d15b8a000000012a840da6 204 205 /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ 206 .octa 0x00000000f1a96922000000001d181c0c 207 208 /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ 209 .octa 0x00000001ac80d03c0000000068b7d1f6 210 211 /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ 212 .octa 0x000000000f11d56a000000005b0f14fc 213 214 /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ 215 .octa 0x00000001f1c022a20000000179e9e730 216 217 /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ 218 .octa 0x0000000173d00ae200000001ce1368d6 219 220 /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ 221 .octa 0x00000001d4ffe4ac0000000112c3a84c 222 223 /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ 224 .octa 0x000000016edc5ae400000000de940fee 225 226 /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ 227 .octa 0x00000001f1a0214000000000fe896b7e 228 229 /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ 230 .octa 0x00000000ca0b28a000000001f797431c 231 232 /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ 233 .octa 0x00000001928e30a20000000053e989ba 234 235 /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ 236 .octa 0x0000000097b1b002000000003920cd16 237 238 /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ 239 .octa 0x00000000b15bf90600000001e6f579b8 240 241 /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ 242 .octa 0x00000000411c5d52000000007493cb0a 243 244 /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ 245 .octa 0x00000001c36f330000000001bdd376d8 246 247 /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ 248 .octa 0x00000001119227e0000000016badfee6 249 250 /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ 251 .octa 0x00000000114d47020000000071de5c58 252 253 /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ 254 .octa 
0x00000000458b5b9800000000453f317c 255 256 /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ 257 .octa 0x000000012e31fb8e0000000121675cce 258 259 /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ 260 .octa 0x000000005cf619d800000001f409ee92 261 262 /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ 263 .octa 0x0000000063f4d8b200000000f36b9c88 264 265 /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ 266 .octa 0x000000004138dc8a0000000036b398f4 267 268 /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ 269 .octa 0x00000001d29ee8e000000001748f9adc 270 271 /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ 272 .octa 0x000000006a08ace800000001be94ec00 273 274 /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ 275 .octa 0x0000000127d4201000000000b74370d6 276 277 /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ 278 .octa 0x0000000019d76b6200000001174d0b98 279 280 /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ 281 .octa 0x00000001b1471f6e00000000befc06a4 282 283 /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ 284 .octa 0x00000001f64c19cc00000001ae125288 285 286 /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ 287 .octa 0x00000000003c0ea00000000095c19b34 288 289 /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ 290 .octa 0x000000014d73abf600000001a78496f2 291 292 /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ 293 .octa 0x00000001620eb84400000001ac5390a0 294 295 /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ 296 .octa 0x0000000147655048000000002a80ed6e 297 298 /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ 299 .octa 0x0000000067b5077e00000001fa9b0128 300 301 /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ 302 .octa 0x0000000010ffe20600000001ea94929e 303 304 /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ 305 .octa 0x000000000fee8f1e0000000125f4305c 306 307 /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ 308 .octa 0x00000001da26fbae00000001471e2002 309 310 /* x^168960 
mod p(x)` << 1, x^169024 mod p(x)` << 1 */ 311 .octa 0x00000001b3a8bd880000000132d2253a 312 313 /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ 314 .octa 0x00000000e8f3898e00000000f26b3592 315 316 /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ 317 .octa 0x00000000b0d0d28c00000000bc8b67b0 318 319 /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ 320 .octa 0x0000000030f2a798000000013a826ef2 321 322 /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ 323 .octa 0x000000000fba10020000000081482c84 324 325 /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ 326 .octa 0x00000000bdb9bd7200000000e77307c2 327 328 /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ 329 .octa 0x0000000075d3bf5a00000000d4a07ec8 330 331 /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ 332 .octa 0x00000000ef1f98a00000000017102100 333 334 /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ 335 .octa 0x00000000689c760200000000db406486 336 337 /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ 338 .octa 0x000000016d5fa5fe0000000192db7f88 339 340 /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ 341 .octa 0x00000001d0d2b9ca000000018bf67b1e 342 343 /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ 344 .octa 0x0000000041e7b470000000007c09163e 345 346 /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ 347 .octa 0x00000001cbb6495e000000000adac060 348 349 /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ 350 .octa 0x000000010052a0b000000000bd8316ae 351 352 /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ 353 .octa 0x00000001d8effb5c000000019f09ab54 354 355 /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ 356 .octa 0x00000001d969853c0000000125155542 357 358 /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ 359 .octa 0x00000000523ccce2000000018fdb5882 360 361 /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ 362 .octa 0x000000001e2436bc00000000e794b3f4 363 364 /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ 365 .octa 
0x00000000ddd1c3a2000000016f9bb022 366 367 /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ 368 .octa 0x0000000019fcfe3800000000290c9978 369 370 /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ 371 .octa 0x00000001ce95db640000000083c0f350 372 373 /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ 374 .octa 0x00000000af5828060000000173ea6628 375 376 /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ 377 .octa 0x00000001006388f600000001c8b4e00a 378 379 /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ 380 .octa 0x0000000179eca00a00000000de95d6aa 381 382 /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ 383 .octa 0x0000000122410a6a000000010b7f7248 384 385 /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ 386 .octa 0x000000004288e87c00000001326e3a06 387 388 /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ 389 .octa 0x000000016c5490da00000000bb62c2e6 390 391 /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ 392 .octa 0x00000000d1c71f6e0000000156a4b2c2 393 394 /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ 395 .octa 0x00000001b4ce08a6000000011dfe763a 396 397 /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ 398 .octa 0x00000001466ba60c000000007bcca8e2 399 400 /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ 401 .octa 0x00000001f6c488a40000000186118faa 402 403 /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ 404 .octa 0x000000013bfb06820000000111a65a88 405 406 /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ 407 .octa 0x00000000690e9e54000000003565e1c4 408 409 /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ 410 .octa 0x00000000281346b6000000012ed02a82 411 412 /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ 413 .octa 0x000000015646402400000000c486ecfc 414 415 /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ 416 .octa 0x000000016063a8dc0000000001b951b2 417 418 /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ 419 .octa 0x0000000116a663620000000048143916 420 421 /* x^131072 
mod p(x)` << 1, x^131136 mod p(x)` << 1 */ 422 .octa 0x000000017e8aa4d200000001dc2ae124 423 424 /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ 425 .octa 0x00000001728eb10c00000001416c58d6 426 427 /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ 428 .octa 0x00000001b08fd7fa00000000a479744a 429 430 /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ 431 .octa 0x00000001092a16e80000000096ca3a26 432 433 /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ 434 .octa 0x00000000a505637c00000000ff223d4e 435 436 /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ 437 .octa 0x00000000d94869b2000000010e84da42 438 439 /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ 440 .octa 0x00000001c8b203ae00000001b61ba3d0 441 442 /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ 443 .octa 0x000000005704aea000000000680f2de8 444 445 /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ 446 .octa 0x000000012e295fa2000000008772a9a8 447 448 /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ 449 .octa 0x000000011d0908bc0000000155f295bc 450 451 /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ 452 .octa 0x0000000193ed97ea00000000595f9282 453 454 /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ 455 .octa 0x000000013a0f1c520000000164b1c25a 456 457 /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ 458 .octa 0x000000010c2c40c000000000fbd67c50 459 460 /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ 461 .octa 0x00000000ff6fac3e0000000096076268 462 463 /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ 464 .octa 0x000000017b3609c000000001d288e4cc 465 466 /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ 467 .octa 0x0000000088c8c92200000001eaac1bdc 468 469 /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ 470 .octa 0x00000001751baae600000001f1ea39e2 471 472 /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ 473 .octa 0x000000010795297200000001eb6506fc 474 475 /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ 476 .octa 
0x0000000162b00abe000000010f806ffe 477 478 /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ 479 .octa 0x000000000d7b404c000000010408481e 480 481 /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ 482 .octa 0x00000000763b13d40000000188260534 483 484 /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ 485 .octa 0x00000000f6dc22d80000000058fc73e0 486 487 /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ 488 .octa 0x000000007daae06000000000391c59b8 489 490 /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ 491 .octa 0x000000013359ab7c000000018b638400 492 493 /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ 494 .octa 0x000000008add438a000000011738f5c4 495 496 /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ 497 .octa 0x00000001edbefdea000000008cf7c6da 498 499 /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ 500 .octa 0x000000004104e0f800000001ef97fb16 501 502 /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ 503 .octa 0x00000000b48a82220000000102130e20 504 505 /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ 506 .octa 0x00000001bcb4684400000000db968898 507 508 /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ 509 .octa 0x000000013293ce0a00000000b5047b5e 510 511 /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ 512 .octa 0x00000001710d0844000000010b90fdb2 513 514 /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ 515 .octa 0x0000000117907f6e000000004834a32e 516 517 /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ 518 .octa 0x0000000087ddf93e0000000059c8f2b0 519 520 /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ 521 .octa 0x000000005970e9b00000000122cec508 522 523 /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ 524 .octa 0x0000000185b2b7d0000000000a330cda 525 526 /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ 527 .octa 0x00000001dcee0efc000000014a47148c 528 529 /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ 530 .octa 0x0000000030da27220000000042c61cb8 531 532 /* x^93184 mod p(x)` << 
1, x^93248 mod p(x)` << 1 */ 533 .octa 0x000000012f925a180000000012fe6960 534 535 /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ 536 .octa 0x00000000dd2e357c00000000dbda2c20 537 538 /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ 539 .octa 0x00000000071c80de000000011122410c 540 541 /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ 542 .octa 0x000000011513140a00000000977b2070 543 544 /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ 545 .octa 0x00000001df876e8e000000014050438e 546 547 /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ 548 .octa 0x000000015f81d6ce0000000147c840e8 549 550 /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ 551 .octa 0x000000019dd94dbe00000001cc7c88ce 552 553 /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ 554 .octa 0x00000001373d206e00000001476b35a4 555 556 /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ 557 .octa 0x00000000668ccade000000013d52d508 558 559 /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ 560 .octa 0x00000001b192d268000000008e4be32e 561 562 /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ 563 .octa 0x00000000e30f3a7800000000024120fe 564 565 /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ 566 .octa 0x000000010ef1f7bc00000000ddecddb4 567 568 /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ 569 .octa 0x00000001f5ac738000000000d4d403bc 570 571 /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ 572 .octa 0x000000011822ea7000000001734b89aa 573 574 /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ 575 .octa 0x00000000c3a33848000000010e7a58d6 576 577 /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ 578 .octa 0x00000001bd151c2400000001f9f04e9c 579 580 /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ 581 .octa 0x0000000056002d7600000000b692225e 582 583 /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ 584 .octa 0x000000014657c4f4000000019b8d3f3e 585 586 /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ 587 .octa 0x0000000113742d7c00000001a874f11e 588 589 /* 
x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ 590 .octa 0x000000019c5920ba000000010d5a4254 591 592 /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ 593 .octa 0x000000005216d2d600000000bbb2f5d6 594 595 /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ 596 .octa 0x0000000136f5ad8a0000000179cc0e36 597 598 /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ 599 .octa 0x000000018b07beb600000001dca1da4a 600 601 /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ 602 .octa 0x00000000db1e93b000000000feb1a192 603 604 /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ 605 .octa 0x000000000b96fa3a00000000d1eeedd6 606 607 /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ 608 .octa 0x00000001d9968af0000000008fad9bb4 609 610 /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ 611 .octa 0x000000000e4a77a200000001884938e4 612 613 /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ 614 .octa 0x00000000508c2ac800000001bc2e9bc0 615 616 /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ 617 .octa 0x0000000021572a8000000001f9658a68 618 619 /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ 620 .octa 0x00000001b859daf2000000001b9224fc 621 622 /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ 623 .octa 0x000000016f7884740000000055b2fb84 624 625 /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ 626 .octa 0x00000001b438810e000000018b090348 627 628 /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ 629 .octa 0x0000000095ddc6f2000000011ccbd5ea 630 631 /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ 632 .octa 0x00000001d977c20c0000000007ae47f8 633 634 /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ 635 .octa 0x00000000ebedb99a0000000172acbec0 636 637 /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ 638 .octa 0x00000001df9e9e9200000001c6e3ff20 639 640 /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ 641 .octa 0x00000001a4a3f95200000000e1b38744 642 643 /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ 644 .octa 
0x00000000e2f5122000000000791585b2 645 646 /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ 647 .octa 0x000000004aa01f3e00000000ac53b894 648 649 /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ 650 .octa 0x00000000b3e90a5800000001ed5f2cf4 651 652 /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ 653 .octa 0x000000000c9ca2aa00000001df48b2e0 654 655 /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ 656 .octa 0x000000015168231600000000049c1c62 657 658 /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ 659 .octa 0x0000000036fce78c000000017c460c12 660 661 /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ 662 .octa 0x000000009037dc10000000015be4da7e 663 664 /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ 665 .octa 0x00000000d3298582000000010f38f668 666 667 /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ 668 .octa 0x00000001b42e8ad60000000039f40a00 669 670 /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ 671 .octa 0x00000000142a983800000000bd4c10c4 672 673 /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ 674 .octa 0x0000000109c7f1900000000042db1d98 675 676 /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ 677 .octa 0x0000000056ff931000000001c905bae6 678 679 /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ 680 .octa 0x00000001594513aa00000000069d40ea 681 682 /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ 683 .octa 0x00000001e3b5b1e8000000008e4fbad0 684 685 /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ 686 .octa 0x000000011dd5fc080000000047bedd46 687 688 /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ 689 .octa 0x00000001675f0cc20000000026396bf8 690 691 /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ 692 .octa 0x00000000d1c8dd4400000000379beb92 693 694 /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ 695 .octa 0x0000000115ebd3d8000000000abae54a 696 697 /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ 698 .octa 0x00000001ecbd0dac0000000007e6a128 699 700 /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 
1 */ 701 .octa 0x00000000cdf67af2000000000ade29d2 702 703 /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ 704 .octa 0x000000004c01ff4c00000000f974c45c 705 706 /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ 707 .octa 0x00000000f2d8657e00000000e77ac60a 708 709 /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ 710 .octa 0x000000006bae74c40000000145895816 711 712 /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ 713 .octa 0x0000000152af8aa00000000038e362be 714 715 /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ 716 .octa 0x0000000004663802000000007f991a64 717 718 /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ 719 .octa 0x00000001ab2f5afc00000000fa366d3a 720 721 /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ 722 .octa 0x0000000074a4ebd400000001a2bb34f0 723 724 /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ 725 .octa 0x00000001d7ab3a4c0000000028a9981e 726 727 /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ 728 .octa 0x00000001a8da60c600000001dbc672be 729 730 /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ 731 .octa 0x000000013cf6382000000000b04d77f6 732 733 /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ 734 .octa 0x00000000bec12e1e0000000124400d96 735 736 /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ 737 .octa 0x00000001c6368010000000014ca4b414 738 739 /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ 740 .octa 0x00000001e6e78758000000012fe2c938 741 742 /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ 743 .octa 0x000000008d7f2b3c00000001faed01e6 744 745 /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ 746 .octa 0x000000016b4a156e000000007e80ecfe 747 748 /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ 749 .octa 0x00000001c63cfeb60000000098daee94 750 751 /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ 752 .octa 0x000000015f902670000000010a04edea 753 754 /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ 755 .octa 0x00000001cd5de11e00000001c00b4524 756 757 /* x^16384 mod p(x)` << 1, 
x^16448 mod p(x)` << 1 */ 758 .octa 0x000000001acaec540000000170296550 759 760 /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ 761 .octa 0x000000002bd0ca780000000181afaa48 762 763 /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ 764 .octa 0x0000000032d63d5c0000000185a31ffa 765 766 /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ 767 .octa 0x000000001c6d4e4c000000002469f608 768 769 /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ 770 .octa 0x0000000106a60b92000000006980102a 771 772 /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ 773 .octa 0x00000000d3855e120000000111ea9ca8 774 775 /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ 776 .octa 0x00000000e312563600000001bd1d29ce 777 778 /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ 779 .octa 0x000000009e8f7ea400000001b34b9580 780 781 /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ 782 .octa 0x00000001c82e562c000000003076054e 783 784 /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ 785 .octa 0x00000000ca9f09ce000000012a608ea4 786 787 /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ 788 .octa 0x00000000c63764e600000000784d05fe 789 790 /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ 791 .octa 0x0000000168d2e49e000000016ef0d82a 792 793 /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ 794 .octa 0x00000000e986c1480000000075bda454 795 796 /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ 797 .octa 0x00000000cfb65894000000003dc0a1c4 798 799 /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ 800 .octa 0x0000000111cadee400000000e9a5d8be 801 802 /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ 803 .octa 0x0000000171fb63ce00000001609bc4b4 804 805.short_constants: 806 807 /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ 808 /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ 809 .octa 0x7fec2963e5bf80485cf015c388e56f72 810 811 /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ 812 
.octa 0x38e888d4844752a9963a18920246e2e6

	/* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
	.octa 0x42316c00730206ad419a441956993a31

	/* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
	.octa 0x543d5c543e65ddf9924752ba2b830011

	/* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
	.octa 0x78e87aaf56767c9255bd7f9518e4a304

	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
	.octa 0x8f68fcec1903da7f6d76739fe0553f1e

	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
	.octa 0x3f4840246791d588c133722b1fe0b5c3

	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
	.octa 0x34c96751b04de25a64b67ee0e55ef1f3

	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
	.octa 0x156c8e180b4a395b069db049b8fdb1e7

	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e

	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
	.octa 0x041d37768cd75659817cdc5119b29a35

	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c

	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
	.octa 0x0e148e8252377a554f256efcb82be955

	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
	.octa 0x9c25531d19e65ddeec1631edb2dea967

	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
	.octa 0x790606ff9957c0a65d27e147510ac59a

	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
	.octa 0x82f63b786ea2d55ca66805eb18b8ea18


.barrett_constants:
	/* 33 bit reflected Barrett constant m - (4^32)/n */
	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
	/* 33 bit reflected Barrett constant n */
	.octa 0x00000000000000000000000105ec76f1

	.text

/*
 * On big endian builds every 16 byte load of input data is passed through
 * vperm with the byteswap constant; on little endian builds VPERM() expands
 * to nothing.
 */
#if defined(__BIG_ENDIAN__)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/* GPRs holding the fixed byte offsets used as lvx/stvx index registers */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

/* Currently loaded folding constants */
#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/*
 * unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len)
 *
 * In:	r3 = crc (initial value), r4 = p, r5 = len
 * Out:	r3 = resulting checksum; when len == 0 the incoming crc is returned
 *	unchanged (see .Lzero).
 *
 * Per the comment at the top of this file, p must be 16 byte aligned and
 * len a multiple of 16. A non-zero len below 16 is not handled: .Lshort
 * would load a chunk count of 0 into CTR, and bdz decrements CTR before
 * testing it.
 *
 * r25-r31 and v20-v29 are saved at negative offsets from r1 (r1 itself is
 * never moved) and are restored at .Lout. r0, r6-r10, CTR, CR0 and v0-v19
 * are used as scratch.
 *
 * Buffers of 256 bytes or more take the main path: 128 byte blocks are
 * folded with VPMSUMD in passes of at most MAX_SIZE bytes, the remaining
 * 0 - 112 byte tail is folded with VPMSUMW, and the result goes through
 * .Lbarrett_reduction. Shorter buffers take .Lshort, which folds each
 * 16 byte chunk directly against .short_constants.
 *
 * "ori r2,r2,0" ORs r2 with zero and so leaves all registers unchanged.
 * NOTE(review): presumably a scheduling nop that separates the instruction
 * groups - confirm before removing any of them.
 */
FUNC_START(__crc32c_vpmsum)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0			/* r0 == 0: first pass, nothing in v16-v23 yet */

	/*
	 * Enough room for saving 10 non volatile VMX registers.
	 * NOTE(review): 56+10*16 is not a multiple of 16; lvx/stvx are
	 * expected to ignore the low four bits of the address, so each slot
	 * presumably lands on the next lower 16 byte boundary - confirm
	 * against the ISA.
	 */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	/* Keep the incoming crc; it is the return value when len == 0 */
	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
	vsldoi	v8,zeroes,v8,8		/* shift into bottom 32 bits */

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	/* r6 = len with the low 7 bits cleared, ie whole 128 byte blocks */
	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6			/* r7 = min(r6, MAX_SIZE) */
2:	subf	r6,r7,r6		/* r6 = bytes left after this pass */

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9		/* r8 = MAX_SIZE/8 - 16 * (128 byte blocks) */

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	/* const2 is no longer needed this iteration; fetch the next one */
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * Loop back while whole blocks remain. r0 = 1 tells the next pass
	 * to start from v16-v23 instead of loading fresh data. The bne
	 * tests the cmpdi result: addi does not modify CR0.
	 * NOTE(review): the extra 128 presumably accounts for the block
	 * already held in v16-v23, which the next pass does not reload -
	 * confirm.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1

	vand	v0,v0,mask_64bit

	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	/* Restore the registers saved on entry, from the same slots */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

	/*
	 * Reached when CTR hits zero right after the first warm up pass:
	 * multiply the only loaded block and go straight to the final xor.
	 */
.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

	/*
	 * Less than 256 bytes: fold each 16 byte chunk against its own
	 * constant, then xor the products together. CTR counts the chunks;
	 * each bdz jumps into the xor chain at the label matching the last
	 * product computed, so only valid products are accumulated.
	 */
.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19 and v20 accumulate the products in the .Lv15-.Lv0 xor chain */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* Second group of eight chunks */
	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

	/* len == 0: hand back the crc we were given */
.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(__crc32c_vpmsum)