1/*
2 * Calculate the checksum of data that is 16 byte aligned and a multiple of
3 * 16 bytes.
4 *
5 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
6 * chunks in order to mask the latency of the vpmsum instructions. If we
7 * have more than 32 kB of data to checksum we repeat this step multiple
8 * times, passing in the previous 1024 bits.
9 *
10 * The next step is to reduce the 1024 bits to 64 bits. This step adds
11 * 32 bits of 0s to the end - this matches what a CRC does. We just
12 * calculate constants that land the data in these 32 bits.
13 *
14 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
15 * for n = CRC using POWER8 instructions. We use x = 32.
16 *
17 * http://en.wikipedia.org/wiki/Barrett_reduction
18 *
19 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
20 *
21 * This program is free software; you can redistribute it and/or
22 * modify it under the terms of the GNU General Public License
23 * as published by the Free Software Foundation; either version
24 * 2 of the License, or (at your option) any later version.
25 */
26#include <asm/ppc_asm.h>
27#include <asm/ppc-opcode.h>
28
29	.section	.rodata
30.balign 16
31
	/*
	 * Permute control vector for vperm. __crc32c_vpmsum loads it into the
	 * `byteswap` register with lvx only when BYTESWAP_DATA is defined, and
	 * applies it to input vectors through the VPERM() macro.
	 */
32.byteswap_constant:
33	/* byte reverse permute constant */
34	.octa 0x0F0E0D0C0B0A09080706050403020100
35
/*
 * Largest number of input bytes reduced in one pass of the main loop;
 * __crc32c_vpmsum walks longer buffers in blocks of MAX_SIZE bytes.
 */
36#define MAX_SIZE	32768
/*
 * Constants for the main loop. Each entry is 16 bytes, holds two values
 * (see the per-entry comments) and is used against 128 bytes of input.
 * Entries run from the largest exponent down to x^1024. The lookup code
 * starts at byte offset MAX_SIZE/8 - 16 * (block length / 128) from this
 * label, so a full MAX_SIZE block starts at the first entry and a shorter
 * block starts further into the table. Keep the entry size and order
 * unchanged.
 */
37.constants:
38
39	/* Reduce 262144 bits to 1024 bits */
40	/* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
41	.octa 0x00000000b6ca9e20000000009c37c408
42
43	/* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
44	.octa 0x00000000350249a800000001b51df26c
45
46	/* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
47	.octa 0x00000001862dac54000000000724b9d0
48
49	/* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
50	.octa 0x00000001d87fb48c00000001c00532fe
51
52	/* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
53	.octa 0x00000001f39b699e00000000f05a9362
54
55	/* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
56	.octa 0x0000000101da11b400000001e1007970
57
58	/* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
59	.octa 0x00000001cab571e000000000a57366ee
60
61	/* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
62	.octa 0x00000000c7020cfe0000000192011284
63
64	/* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
65	.octa 0x00000000cdaed1ae0000000162716d9a
66
67	/* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
68	.octa 0x00000001e804effc00000000cd97ecde
69
70	/* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
71	.octa 0x0000000077c3ea3a0000000058812bc0
72
73	/* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
74	.octa 0x0000000068df31b40000000088b8c12e
75
76	/* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
77	.octa 0x00000000b059b6c200000001230b234c
78
79	/* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
80	.octa 0x0000000145fb8ed800000001120b416e
81
82	/* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
83	.octa 0x00000000cbc0916800000001974aecb0
84
85	/* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
86	.octa 0x000000005ceeedc2000000008ee3f226
87
88	/* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
89	.octa 0x0000000047d74e8600000001089aba9a
90
91	/* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
92	.octa 0x00000001407e9e220000000065113872
93
94	/* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
95	.octa 0x00000001da967bda000000005c07ec10
96
97	/* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
98	.octa 0x000000006c8983680000000187590924
99
100	/* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
101	.octa 0x00000000f2d14c9800000000e35da7c6
102
103	/* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
104	.octa 0x00000001993c6ad4000000000415855a
105
106	/* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
107	.octa 0x000000014683d1ac0000000073617758
108
109	/* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
110	.octa 0x00000001a7c93e6c0000000176021d28
111
112	/* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
113	.octa 0x000000010211e90a00000001c358fd0a
114
115	/* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
116	.octa 0x000000001119403e00000001ff7a2c18
117
118	/* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
119	.octa 0x000000001c3261aa00000000f2d9f7e4
120
121	/* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
122	.octa 0x000000014e37a634000000016cf1f9c8
123
124	/* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
125	.octa 0x0000000073786c0c000000010af9279a
126
127	/* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
128	.octa 0x000000011dc037f80000000004f101e8
129
130	/* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
131	.octa 0x0000000031433dfc0000000070bcf184
132
133	/* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
134	.octa 0x000000009cde8348000000000a8de642
135
136	/* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
137	.octa 0x0000000038d3c2a60000000062ea130c
138
139	/* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
140	.octa 0x000000011b25f26000000001eb31cbb2
141
142	/* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
143	.octa 0x000000001629e6f00000000170783448
144
145	/* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
146	.octa 0x0000000160838b4c00000001a684b4c6
147
148	/* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
149	.octa 0x000000007a44011c00000000253ca5b4
150
151	/* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
152	.octa 0x00000000226f417a0000000057b4b1e2
153
154	/* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
155	.octa 0x0000000045eb2eb400000000b6bd084c
156
157	/* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
158	.octa 0x000000014459d70c0000000123c2d592
159
160	/* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
161	.octa 0x00000001d406ed8200000000159dafce
162
163	/* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
164	.octa 0x0000000160c8e1a80000000127e1a64e
165
166	/* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
167	.octa 0x0000000027ba80980000000056860754
168
169	/* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
170	.octa 0x000000006d92d01800000001e661aae8
171
172	/* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
173	.octa 0x000000012ed7e3f200000000f82c6166
174
175	/* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
176	.octa 0x000000002dc8778800000000c4f9c7ae
177
178	/* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
179	.octa 0x0000000018240bb80000000074203d20
180
181	/* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
182	.octa 0x000000001ad381580000000198173052
183
184	/* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
185	.octa 0x00000001396b78f200000001ce8aba54
186
187	/* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
188	.octa 0x000000011a68133400000001850d5d94
189
190	/* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
191	.octa 0x000000012104732e00000001d609239c
192
193	/* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
194	.octa 0x00000000a140d90c000000001595f048
195
196	/* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
197	.octa 0x00000001b7215eda0000000042ccee08
198
199	/* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
200	.octa 0x00000001aaf1df3c000000010a389d74
201
202	/* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
203	.octa 0x0000000029d15b8a000000012a840da6
204
205	/* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
206	.octa 0x00000000f1a96922000000001d181c0c
207
208	/* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
209	.octa 0x00000001ac80d03c0000000068b7d1f6
210
211	/* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
212	.octa 0x000000000f11d56a000000005b0f14fc
213
214	/* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
215	.octa 0x00000001f1c022a20000000179e9e730
216
217	/* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
218	.octa 0x0000000173d00ae200000001ce1368d6
219
220	/* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
221	.octa 0x00000001d4ffe4ac0000000112c3a84c
222
223	/* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
224	.octa 0x000000016edc5ae400000000de940fee
225
226	/* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
227	.octa 0x00000001f1a0214000000000fe896b7e
228
229	/* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
230	.octa 0x00000000ca0b28a000000001f797431c
231
232	/* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
233	.octa 0x00000001928e30a20000000053e989ba
234
235	/* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
236	.octa 0x0000000097b1b002000000003920cd16
237
238	/* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
239	.octa 0x00000000b15bf90600000001e6f579b8
240
241	/* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
242	.octa 0x00000000411c5d52000000007493cb0a
243
244	/* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
245	.octa 0x00000001c36f330000000001bdd376d8
246
247	/* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
248	.octa 0x00000001119227e0000000016badfee6
249
250	/* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
251	.octa 0x00000000114d47020000000071de5c58
252
253	/* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
254	.octa 0x00000000458b5b9800000000453f317c
255
256	/* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
257	.octa 0x000000012e31fb8e0000000121675cce
258
259	/* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
260	.octa 0x000000005cf619d800000001f409ee92
261
262	/* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
263	.octa 0x0000000063f4d8b200000000f36b9c88
264
265	/* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
266	.octa 0x000000004138dc8a0000000036b398f4
267
268	/* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
269	.octa 0x00000001d29ee8e000000001748f9adc
270
271	/* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
272	.octa 0x000000006a08ace800000001be94ec00
273
274	/* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
275	.octa 0x0000000127d4201000000000b74370d6
276
277	/* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
278	.octa 0x0000000019d76b6200000001174d0b98
279
280	/* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
281	.octa 0x00000001b1471f6e00000000befc06a4
282
283	/* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
284	.octa 0x00000001f64c19cc00000001ae125288
285
286	/* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
287	.octa 0x00000000003c0ea00000000095c19b34
288
289	/* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
290	.octa 0x000000014d73abf600000001a78496f2
291
292	/* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
293	.octa 0x00000001620eb84400000001ac5390a0
294
295	/* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
296	.octa 0x0000000147655048000000002a80ed6e
297
298	/* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
299	.octa 0x0000000067b5077e00000001fa9b0128
300
301	/* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
302	.octa 0x0000000010ffe20600000001ea94929e
303
304	/* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
305	.octa 0x000000000fee8f1e0000000125f4305c
306
307	/* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
308	.octa 0x00000001da26fbae00000001471e2002
309
310	/* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
311	.octa 0x00000001b3a8bd880000000132d2253a
312
313	/* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
314	.octa 0x00000000e8f3898e00000000f26b3592
315
316	/* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
317	.octa 0x00000000b0d0d28c00000000bc8b67b0
318
319	/* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
320	.octa 0x0000000030f2a798000000013a826ef2
321
322	/* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
323	.octa 0x000000000fba10020000000081482c84
324
325	/* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
326	.octa 0x00000000bdb9bd7200000000e77307c2
327
328	/* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
329	.octa 0x0000000075d3bf5a00000000d4a07ec8
330
331	/* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
332	.octa 0x00000000ef1f98a00000000017102100
333
334	/* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
335	.octa 0x00000000689c760200000000db406486
336
337	/* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
338	.octa 0x000000016d5fa5fe0000000192db7f88
339
340	/* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
341	.octa 0x00000001d0d2b9ca000000018bf67b1e
342
343	/* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
344	.octa 0x0000000041e7b470000000007c09163e
345
346	/* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
347	.octa 0x00000001cbb6495e000000000adac060
348
349	/* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
350	.octa 0x000000010052a0b000000000bd8316ae
351
352	/* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
353	.octa 0x00000001d8effb5c000000019f09ab54
354
355	/* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
356	.octa 0x00000001d969853c0000000125155542
357
358	/* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
359	.octa 0x00000000523ccce2000000018fdb5882
360
361	/* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
362	.octa 0x000000001e2436bc00000000e794b3f4
363
364	/* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
365	.octa 0x00000000ddd1c3a2000000016f9bb022
366
367	/* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
368	.octa 0x0000000019fcfe3800000000290c9978
369
370	/* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
371	.octa 0x00000001ce95db640000000083c0f350
372
373	/* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
374	.octa 0x00000000af5828060000000173ea6628
375
376	/* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
377	.octa 0x00000001006388f600000001c8b4e00a
378
379	/* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
380	.octa 0x0000000179eca00a00000000de95d6aa
381
382	/* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
383	.octa 0x0000000122410a6a000000010b7f7248
384
385	/* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
386	.octa 0x000000004288e87c00000001326e3a06
387
388	/* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
389	.octa 0x000000016c5490da00000000bb62c2e6
390
391	/* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
392	.octa 0x00000000d1c71f6e0000000156a4b2c2
393
394	/* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
395	.octa 0x00000001b4ce08a6000000011dfe763a
396
397	/* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
398	.octa 0x00000001466ba60c000000007bcca8e2
399
400	/* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
401	.octa 0x00000001f6c488a40000000186118faa
402
403	/* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
404	.octa 0x000000013bfb06820000000111a65a88
405
406	/* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
407	.octa 0x00000000690e9e54000000003565e1c4
408
409	/* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
410	.octa 0x00000000281346b6000000012ed02a82
411
412	/* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
413	.octa 0x000000015646402400000000c486ecfc
414
415	/* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
416	.octa 0x000000016063a8dc0000000001b951b2
417
418	/* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
419	.octa 0x0000000116a663620000000048143916
420
421	/* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
422	.octa 0x000000017e8aa4d200000001dc2ae124
423
424	/* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
425	.octa 0x00000001728eb10c00000001416c58d6
426
427	/* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
428	.octa 0x00000001b08fd7fa00000000a479744a
429
430	/* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
431	.octa 0x00000001092a16e80000000096ca3a26
432
433	/* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
434	.octa 0x00000000a505637c00000000ff223d4e
435
436	/* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
437	.octa 0x00000000d94869b2000000010e84da42
438
439	/* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
440	.octa 0x00000001c8b203ae00000001b61ba3d0
441
442	/* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
443	.octa 0x000000005704aea000000000680f2de8
444
445	/* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
446	.octa 0x000000012e295fa2000000008772a9a8
447
448	/* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
449	.octa 0x000000011d0908bc0000000155f295bc
450
451	/* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
452	.octa 0x0000000193ed97ea00000000595f9282
453
454	/* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
455	.octa 0x000000013a0f1c520000000164b1c25a
456
457	/* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
458	.octa 0x000000010c2c40c000000000fbd67c50
459
460	/* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
461	.octa 0x00000000ff6fac3e0000000096076268
462
463	/* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
464	.octa 0x000000017b3609c000000001d288e4cc
465
466	/* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
467	.octa 0x0000000088c8c92200000001eaac1bdc
468
469	/* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
470	.octa 0x00000001751baae600000001f1ea39e2
471
472	/* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
473	.octa 0x000000010795297200000001eb6506fc
474
475	/* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
476	.octa 0x0000000162b00abe000000010f806ffe
477
478	/* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
479	.octa 0x000000000d7b404c000000010408481e
480
481	/* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
482	.octa 0x00000000763b13d40000000188260534
483
484	/* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
485	.octa 0x00000000f6dc22d80000000058fc73e0
486
487	/* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
488	.octa 0x000000007daae06000000000391c59b8
489
490	/* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
491	.octa 0x000000013359ab7c000000018b638400
492
493	/* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
494	.octa 0x000000008add438a000000011738f5c4
495
496	/* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
497	.octa 0x00000001edbefdea000000008cf7c6da
498
499	/* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
500	.octa 0x000000004104e0f800000001ef97fb16
501
502	/* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
503	.octa 0x00000000b48a82220000000102130e20
504
505	/* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
506	.octa 0x00000001bcb4684400000000db968898
507
508	/* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
509	.octa 0x000000013293ce0a00000000b5047b5e
510
511	/* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
512	.octa 0x00000001710d0844000000010b90fdb2
513
514	/* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
515	.octa 0x0000000117907f6e000000004834a32e
516
517	/* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
518	.octa 0x0000000087ddf93e0000000059c8f2b0
519
520	/* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
521	.octa 0x000000005970e9b00000000122cec508
522
523	/* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
524	.octa 0x0000000185b2b7d0000000000a330cda
525
526	/* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
527	.octa 0x00000001dcee0efc000000014a47148c
528
529	/* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
530	.octa 0x0000000030da27220000000042c61cb8
531
532	/* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
533	.octa 0x000000012f925a180000000012fe6960
534
535	/* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
536	.octa 0x00000000dd2e357c00000000dbda2c20
537
538	/* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
539	.octa 0x00000000071c80de000000011122410c
540
541	/* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
542	.octa 0x000000011513140a00000000977b2070
543
544	/* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
545	.octa 0x00000001df876e8e000000014050438e
546
547	/* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
548	.octa 0x000000015f81d6ce0000000147c840e8
549
550	/* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
551	.octa 0x000000019dd94dbe00000001cc7c88ce
552
553	/* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
554	.octa 0x00000001373d206e00000001476b35a4
555
556	/* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
557	.octa 0x00000000668ccade000000013d52d508
558
559	/* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
560	.octa 0x00000001b192d268000000008e4be32e
561
562	/* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
563	.octa 0x00000000e30f3a7800000000024120fe
564
565	/* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
566	.octa 0x000000010ef1f7bc00000000ddecddb4
567
568	/* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
569	.octa 0x00000001f5ac738000000000d4d403bc
570
571	/* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
572	.octa 0x000000011822ea7000000001734b89aa
573
574	/* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
575	.octa 0x00000000c3a33848000000010e7a58d6
576
577	/* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
578	.octa 0x00000001bd151c2400000001f9f04e9c
579
580	/* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
581	.octa 0x0000000056002d7600000000b692225e
582
583	/* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
584	.octa 0x000000014657c4f4000000019b8d3f3e
585
586	/* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
587	.octa 0x0000000113742d7c00000001a874f11e
588
589	/* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
590	.octa 0x000000019c5920ba000000010d5a4254
591
592	/* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
593	.octa 0x000000005216d2d600000000bbb2f5d6
594
595	/* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
596	.octa 0x0000000136f5ad8a0000000179cc0e36
597
598	/* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
599	.octa 0x000000018b07beb600000001dca1da4a
600
601	/* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
602	.octa 0x00000000db1e93b000000000feb1a192
603
604	/* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
605	.octa 0x000000000b96fa3a00000000d1eeedd6
606
607	/* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
608	.octa 0x00000001d9968af0000000008fad9bb4
609
610	/* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
611	.octa 0x000000000e4a77a200000001884938e4
612
613	/* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
614	.octa 0x00000000508c2ac800000001bc2e9bc0
615
616	/* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
617	.octa 0x0000000021572a8000000001f9658a68
618
619	/* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
620	.octa 0x00000001b859daf2000000001b9224fc
621
622	/* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
623	.octa 0x000000016f7884740000000055b2fb84
624
625	/* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
626	.octa 0x00000001b438810e000000018b090348
627
628	/* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
629	.octa 0x0000000095ddc6f2000000011ccbd5ea
630
631	/* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
632	.octa 0x00000001d977c20c0000000007ae47f8
633
634	/* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
635	.octa 0x00000000ebedb99a0000000172acbec0
636
637	/* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
638	.octa 0x00000001df9e9e9200000001c6e3ff20
639
640	/* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
641	.octa 0x00000001a4a3f95200000000e1b38744
642
643	/* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
644	.octa 0x00000000e2f5122000000000791585b2
645
646	/* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
647	.octa 0x000000004aa01f3e00000000ac53b894
648
649	/* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
650	.octa 0x00000000b3e90a5800000001ed5f2cf4
651
652	/* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
653	.octa 0x000000000c9ca2aa00000001df48b2e0
654
655	/* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
656	.octa 0x000000015168231600000000049c1c62
657
658	/* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
659	.octa 0x0000000036fce78c000000017c460c12
660
661	/* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
662	.octa 0x000000009037dc10000000015be4da7e
663
664	/* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
665	.octa 0x00000000d3298582000000010f38f668
666
667	/* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
668	.octa 0x00000001b42e8ad60000000039f40a00
669
670	/* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
671	.octa 0x00000000142a983800000000bd4c10c4
672
673	/* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
674	.octa 0x0000000109c7f1900000000042db1d98
675
676	/* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
677	.octa 0x0000000056ff931000000001c905bae6
678
679	/* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
680	.octa 0x00000001594513aa00000000069d40ea
681
682	/* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
683	.octa 0x00000001e3b5b1e8000000008e4fbad0
684
685	/* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
686	.octa 0x000000011dd5fc080000000047bedd46
687
688	/* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
689	.octa 0x00000001675f0cc20000000026396bf8
690
691	/* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
692	.octa 0x00000000d1c8dd4400000000379beb92
693
694	/* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
695	.octa 0x0000000115ebd3d8000000000abae54a
696
697	/* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
698	.octa 0x00000001ecbd0dac0000000007e6a128
699
700	/* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
701	.octa 0x00000000cdf67af2000000000ade29d2
702
703	/* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
704	.octa 0x000000004c01ff4c00000000f974c45c
705
706	/* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
707	.octa 0x00000000f2d8657e00000000e77ac60a
708
709	/* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
710	.octa 0x000000006bae74c40000000145895816
711
712	/* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
713	.octa 0x0000000152af8aa00000000038e362be
714
715	/* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
716	.octa 0x0000000004663802000000007f991a64
717
718	/* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
719	.octa 0x00000001ab2f5afc00000000fa366d3a
720
721	/* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
722	.octa 0x0000000074a4ebd400000001a2bb34f0
723
724	/* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
725	.octa 0x00000001d7ab3a4c0000000028a9981e
726
727	/* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
728	.octa 0x00000001a8da60c600000001dbc672be
729
730	/* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
731	.octa 0x000000013cf6382000000000b04d77f6
732
733	/* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
734	.octa 0x00000000bec12e1e0000000124400d96
735
736	/* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
737	.octa 0x00000001c6368010000000014ca4b414
738
739	/* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
740	.octa 0x00000001e6e78758000000012fe2c938
741
742	/* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
743	.octa 0x000000008d7f2b3c00000001faed01e6
744
745	/* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
746	.octa 0x000000016b4a156e000000007e80ecfe
747
748	/* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
749	.octa 0x00000001c63cfeb60000000098daee94
750
751	/* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
752	.octa 0x000000015f902670000000010a04edea
753
754	/* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
755	.octa 0x00000001cd5de11e00000001c00b4524
756
757	/* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
758	.octa 0x000000001acaec540000000170296550
759
760	/* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
761	.octa 0x000000002bd0ca780000000181afaa48
762
763	/* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
764	.octa 0x0000000032d63d5c0000000185a31ffa
765
766	/* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
767	.octa 0x000000001c6d4e4c000000002469f608
768
769	/* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
770	.octa 0x0000000106a60b92000000006980102a
771
772	/* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
773	.octa 0x00000000d3855e120000000111ea9ca8
774
775	/* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
776	.octa 0x00000000e312563600000001bd1d29ce
777
778	/* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
779	.octa 0x000000009e8f7ea400000001b34b9580
780
781	/* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
782	.octa 0x00000001c82e562c000000003076054e
783
784	/* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
785	.octa 0x00000000ca9f09ce000000012a608ea4
786
787	/* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
788	.octa 0x00000000c63764e600000000784d05fe
789
790	/* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
791	.octa 0x0000000168d2e49e000000016ef0d82a
792
793	/* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
794	.octa 0x00000000e986c1480000000075bda454
795
796	/* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
797	.octa 0x00000000cfb65894000000003dc0a1c4
798
799	/* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
800	.octa 0x0000000111cadee400000000e9a5d8be
801
802	/* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
803	.octa 0x0000000171fb63ce00000001609bc4b4
804
/*
 * Constants for the final 1024-2048 bit to 64 bit reduction. Each 16-byte
 * entry lists four remainders (see the per-entry comments); the sixteen
 * entries step from x^2048 down to x^32.
 * NOTE(review): presumably also the table used by the .Lshort path for
 * inputs under 256 bytes -- confirm against the code that loads it.
 */
805.short_constants:
806
807	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
808	/* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
809	.octa 0x7fec2963e5bf80485cf015c388e56f72
810
811	/* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
812	.octa 0x38e888d4844752a9963a18920246e2e6
813
814	/* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
815	.octa 0x42316c00730206ad419a441956993a31
816
817	/* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
818	.octa 0x543d5c543e65ddf9924752ba2b830011
819
820	/* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
821	.octa 0x78e87aaf56767c9255bd7f9518e4a304
822
823	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
824	.octa 0x8f68fcec1903da7f6d76739fe0553f1e
825
826	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
827	.octa 0x3f4840246791d588c133722b1fe0b5c3
828
829	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
830	.octa 0x34c96751b04de25a64b67ee0e55ef1f3
831
832	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
833	.octa 0x156c8e180b4a395b069db049b8fdb1e7
834
835	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
836	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
837
838	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
839	.octa 0x041d37768cd75659817cdc5119b29a35
840
841	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
842	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c
843
844	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
845	.octa 0x0e148e8252377a554f256efcb82be955
846
847	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
848	.octa 0x9c25531d19e65ddeec1631edb2dea967
849
850	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
851	.octa 0x790606ff9957c0a65d27e147510ac59a
852
853	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
854	.octa 0x82f63b786ea2d55ca66805eb18b8ea18
855
856
/*
 * Operands for the Barrett reduction described at the top of this file:
 * first m, then n. Each value sits in the low-order bits of its own
 * 16-byte entry, with the remaining bits zero.
 */
857.barrett_constants:
858	/* 33 bit reflected Barrett constant m - (4^32)/n */
859	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
860	/* 33 bit reflected Barrett constant n */
861	.octa 0x00000000000000000000000105ec76f1
862
863	.text
864
/*
 * BYTESWAP_DATA is defined only for big-endian builds. It controls whether
 * VPERM() emits a vperm instruction and whether .byteswap_constant is
 * loaded into the `byteswap` register.
 */
865#if defined(__BIG_ENDIAN__)
866#define BYTESWAP_DATA
867#else
868#undef BYTESWAP_DATA
869#endif
870
/*
 * Index registers. __crc32c_vpmsum saves r25-r31 on entry and then loads
 * each with the byte offset its name spells out (16, 32, ... 112); they
 * serve as the index operand of lvx/stvx.
 */
871#define off16		r25
872#define off32		r26
873#define off48		r27
874#define off64		r28
875#define off80		r29
876#define off96		r30
877#define off112		r31
878
/*
 * Vector registers holding table constants. const1 is loaded from the
 * .constants table before the main loop.
 * NOTE(review): const2 is presumably the second, alternating constant
 * register -- confirm against the loop body.
 */
879#define const1		v24
880#define const2		v25
881
/*
 * byteswap:   permute control loaded from .byteswap_constant (only when
 *             BYTESWAP_DATA is defined).
 * mask_32bit: zeroes shifted together with an all-ones vector by 4 bytes
 *             (vsldoi), leaving ones only in the low-order 32 bits.
 * mask_64bit: same construction with an 8 byte shift, leaving ones only in
 *             the low-order 64 bits.
 * zeroes:     all-zero vector (vxor of the register with itself).
 */
882#define byteswap	v26
883#define	mask_32bit	v27
884#define	mask_64bit	v28
885#define zeroes		v29
886
/*
 * VPERM(A, B, C, D): emits `vperm A, B, C, D` when BYTESWAP_DATA is
 * defined and expands to nothing otherwise, so call sites need no
 * conditional of their own.
 */
887#ifdef BYTESWAP_DATA
888#define VPERM(A, B, C, D) vperm	A, B, C, D
889#else
890#define VPERM(A, B, C, D)
891#endif
892
/*
 * unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len)
 *
 * As used by the code below: r3 = crc (initial value), r4 = p (data
 * pointer), r5 = len (byte count). The result is returned in r3; when len
 * is 0 the incoming crc is returned unchanged (see .Lzero).
 *
 * r25-r31 and v20-v29 are saved below r1 without moving the stack pointer
 * and are restored at .Lout. r0, r6-r10, the counter register and the
 * remaining vector registers used here are overwritten without being saved.
 */
FUNC_START(__crc32c_vpmsum)
	/* Save the GPRs that are about to become off16..off112 */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	/* Constant byte offsets, used as index registers for lvx/stvx */
	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	/* r0 is set to 1 once a pass has left its data in v16-v23 */
	li	r0,0

	/*
	 * Enough room for saving 10 non volatile VMX registers: v20-v27 go to
	 * the area addressed through r6, v28-v29 to the area addressed
	 * through r7, both below the seven GPR slots (56 bytes) stored above.
	 *
	 * NOTE(review): if r1 is 16 byte aligned, r1-216 and r1-88 are not;
	 * this appears to rely on stvx/lvx ignoring the low four bits of the
	 * effective address. Save and restore use the same addresses, so they
	 * stay consistent - confirm against the ISA before changing the layout.
	 */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	/* Keep the incoming crc; r3 is reused as a table pointer below */
	mr	r10,r3

	/* Build the all-zero register and an all-ones temporary in v0 */
	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	/* Shift ones in from v0: masks covering the low 4 and low 8 bytes */
	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */

#ifdef BYTESWAP_DATA
	/* Load the permute control vector consumed by VPERM() */
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif
947
	/* Fewer than 256 bytes: use the straight-line code at .Lshort */
	cmpdi	r5,256
	blt	.Lshort

	/* r6 = len with the low 7 bits cleared, i.e. whole 128 byte blocks */
	rldicr	r6,r5,0,56

	/*
	 * Checksum in blocks of MAX_SIZE.
	 * r7 = bytes handled by this pass (r6, capped at MAX_SIZE),
	 * r9 = MAX_SIZE, r6 = bytes left for later passes.
	 */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 *
	 * r8 = MAX_SIZE/8 - 16 * (number of 128 byte blocks in this pass)
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass: load the first 128 bytes into v16-v23 */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

	/* Counter exhausted already: only one multiply round is needed */
2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/*
	 * Second warm up pass: multiply the block held in v16-v23 by const1
	 * into v8-v15 while loading the next 128 bytes into v16-v23.
	 *
	 * "ori r2,r2,0" leaves r2 unchanged; presumably it is only there to
	 * shape instruction scheduling - confirm before removing any of them.
	 */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 *
	 * Within one iteration const2 (first four lanes) and const1 (last
	 * four lanes) were loaded from the same table address; const2 is
	 * reloaded half way through with the entry for the next iteration.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/*
	 * First cool down pass: fold the pending products into v0-v7 and
	 * multiply the last block that was loaded; no further loads here.
	 */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass: fold the last products into v0-v7 */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	/* The pass result lands in v16-v23, where the next pass expects it */
	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * Loop back while bytes remain. The compare is done on r6 before it
	 * is bumped by 128; a further pass skips the first warm up load and
	 * starts from the 128 bytes already held in v16-v23, which is
	 * presumably what the extra 128 accounts for.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b
1232
	/* Work out how many bytes we have left (len modulo 128) */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 *
	 * Eight constants are loaded into v0-v7 and each is multiplied with
	 * the matching 128 bit lane left in v16-v23 by the block loop.
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/*
	 * Now reduce the tail (0 - 112 bytes). Each step multiplies one 16
	 * byte chunk of data with the constant at the same offset from r3 and
	 * folds the product into v0; the counter ends the chain early.
	 */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	/* Seventh and last possible chunk (r5 <= 127), so no counter test */
	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together, result in v0 */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4
1329
/*
 * Final step, entered with the folded value in v0 either by falling
 * through from the tail reduction or by branch from the short path.
 * Leaves the checksum in r3 and falls through to .Lout.
 */
.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	/* v1 = v0 with its two 64 bit halves swapped */
	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1

	/* keep only the low 64 bits */
	vand	v0,v0,mask_64bit

	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */

	/* Get it into r3 */
	MFVRD(R3, v0)
1370
/*
 * Common exit: put back the vector registers and GPRs saved at function
 * entry and return with the result already in r3.
 */
.Lout:
	/* Recreate the two save area pointers used at entry */
	addi	r6,r1,-(56+10*16)
	addi	r7,r6,8*16

	/* v28-v29 live in the area addressed through r7 */
	lvx	v29,off16,r7
	lvx	v28,0,r7

	/* v20-v27 live in the area addressed through r6 */
	lvx	v27,off112,r6
	lvx	v26,off96,r6
	lvx	v25,off80,r6
	lvx	v24,off64,r6
	lvx	v23,off48,r6
	lvx	v22,off32,r6
	lvx	v21,off16,r6
	lvx	v20,0,r6

	/*
	 * The offset registers were needed as lvx indexes above, so the GPRs
	 * they alias can only be reloaded now.
	 */
	ld	r25,-56(r1)
	ld	r26,-48(r1)
	ld	r27,-40(r1)
	ld	r28,-32(r1)
	ld	r29,-24(r1)
	ld	r30,-16(r1)
	ld	r31,-8(r1)

	blr
1395
/*
 * Reached from the block loop when the counter hits zero right after the
 * first warm up pass, so there is nothing to pipeline: the block in
 * v16-v23 is multiplied once into v8-v15 and control joins the second
 * cool down pass, which folds v8-v15 and loads the final 128 bytes.
 */
.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down
1410
/*
 * Short path, taken for len < 256. Up to sixteen 16 byte chunks are each
 * multiplied with their own constant from .short_constants; the products
 * are then xored together and handed to .Lbarrett_reduction in v0.
 *
 * NOTE(review): the counter is loaded with len / 16 and tested only after
 * the first chunk has been processed, so this appears to rely on len being
 * a non-zero multiple of 16 as the file header states - confirm against
 * the callers.
 */
.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19 and v20 are the two accumulators of the xor chain below */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	/*
	 * VPERM() is given the constant register as its second source here.
	 * The control bytes of .byteswap_constant are all below 0x10, so
	 * presumably only the first source (the data) is ever selected.
	 */
	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* Chunks 8-15: advance both pointers so off16..off112 can be reused */
	addi	r3,r3,128
	addi	r4,r4,128

	/* v8 held the initial value; it was consumed above and is free now */
	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/*
	 * Fall-through xor chain. Entering at .LvN folds only the products
	 * vN..v0 that were actually computed, alternating between the two
	 * accumulators v19 and v20.
	 */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	/* Combine the two accumulators into v0 for the Barrett step */
	vxor	v0,v19,v20

	b	.Lbarrett_reduction
1548
/*
 * len == 0: there is nothing to checksum, so return the incoming crc that
 * was copied to r10 at function entry. The exit goes through .Lout so that
 * the registers saved at entry are restored.
 */
.Lzero:
	mr	r3,r10
	b	.Lout

/* Must name the same symbol as the FUNC_START() that opens this function */
FUNC_END(__crc32c_vpmsum)
1554