xref: /openbmc/linux/arch/powerpc/crypto/crc32-vpmsum_core.S (revision f79e4d5f92a129a1159c973735007d4ddc8541f3)
1/*
2 * Core of the accelerated CRC algorithm.
3 * In your file, define the constants and CRC_FUNCTION_NAME
4 * Then include this file.
5 *
6 * Calculate the checksum of data that is 16 byte aligned and a multiple of
7 * 16 bytes.
8 *
9 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
10 * chunks in order to mask the latency of the vpmsum instructions. If we
11 * have more than 32 kB of data to checksum we repeat this step multiple
12 * times, passing in the previous 1024 bits.
13 *
14 * The next step is to reduce the 1024 bits to 64 bits. This step adds
15 * 32 bits of 0s to the end - this matches what a CRC does. We just
16 * calculate constants that land the data in this 32 bits.
17 *
18 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
19 * for n = CRC using POWER8 instructions. We use x = 32.
20 *
21 * http://en.wikipedia.org/wiki/Barrett_reduction
22 *
23 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
24 *
25 * This program is free software; you can redistribute it and/or
26 * modify it under the terms of the GNU General Public License
27 * as published by the Free Software Foundation; either version
28 * 2 of the License, or (at your option) any later version.
29*/
30
31#include <asm/ppc_asm.h>
32#include <asm/ppc-opcode.h>
33
/*
 * Upper bound, in bytes, on how much data one pass of the main loop
 * consumes; longer inputs are handled as several passes.
 */
34#define MAX_SIZE	32768
35
36	.text
37
/*
 * BYTESWAP_DATA is defined for big endian with REFLECT and for little
 * endian without REFLECT, and undefined otherwise. It decides whether
 * VPERM() below emits an instruction and whether the byteswap control
 * vector is loaded at function entry.
 */
38#if defined(__BIG_ENDIAN__) && defined(REFLECT)
39#define BYTESWAP_DATA
40#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
41#define BYTESWAP_DATA
42#else
43#undef BYTESWAP_DATA
44#endif
45
/*
 * GPRs that the function loads with the constants 16..112 and then uses
 * as the index register of lvx/stvx.
 */
46#define off16		r25
47#define off32		r26
48#define off48		r27
49#define off64		r28
50#define off80		r29
51#define off96		r30
52#define off112		r31
53
/* Vector registers holding the constants currently being multiplied by */
54#define const1		v24
55#define const2		v25
56
/*
 * byteswap:   permute control vector loaded from .byteswap_constant
 * mask_32bit: only the low 32 bits set (zeroes shifted left by 4 bytes,
 *             filled with all-ones)
 * mask_64bit: only the low 64 bits set (same construction, 8 bytes)
 * zeroes:     all-zero vector
 */
57#define byteswap	v26
58#define	mask_32bit	v27
59#define	mask_64bit	v28
60#define zeroes		v29
61
/*
 * VPERM() is a real vperm only when the input has to be byte swapped;
 * otherwise it expands to nothing and the loaded data is used as is.
 */
62#ifdef BYTESWAP_DATA
63#define VPERM(A, B, C, D) vperm	A, B, C, D
64#else
65#define VPERM(A, B, C, D)
66#endif
67
68/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
/*
 * Register usage, as visible in this function:
 *   r3       in: crc. Afterwards reused as a pointer into the constant
 *            tables; on return it holds the result moved out of v0.
 *   r4       in: p, advanced by 128 after each group of eight loads
 *   r5       in: len
 *   r10      copy of the incoming crc, returned unchanged when len is 0
 *   r0       0 during the first pass, 1 once v16-v23 carry the state of
 *            a previous pass
 *   r6       bytes still to be handled by the 128 byte loop
 *   r7       size of the current pass, then its 128 byte chunk count
 *   r8, r9   scratch for the constant table offset
 *   r25-r31  off16..off112, constant byte offsets for lvx/stvx
 *   v0-v7    eight parallel accumulators
 *   v8-v15   vpmsum products waiting to be xored into v0-v7
 *   v16-v23  the 128 bytes of input currently in flight
 *
 * The comment at the top of this file requires the data to be 16 byte
 * aligned and a multiple of 16 bytes long.
 *
 * VPMSUMD, VPMSUMW, MTVRD and MFVRD are not defined in this file
 * (presumably they come from the asm/ headers included above). The
 * .constants, .short_constants, .barrett_constants and .byteswap_constant
 * tables are expected from the file that includes this one.
 */
69FUNC_START(CRC_FUNCTION_NAME)
	/*
	 * Save r25-r31 below the stack pointer; r1 itself is not adjusted.
	 * They are reloaded at .Lout.
	 */
70	std	r31,-8(r1)
71	std	r30,-16(r1)
72	std	r29,-24(r1)
73	std	r28,-32(r1)
74	std	r27,-40(r1)
75	std	r26,-48(r1)
76	std	r25,-56(r1)
77
	/* Byte offsets used as the index register of the lvx/stvx below */
78	li	off16,16
79	li	off32,32
80	li	off48,48
81	li	off64,64
82	li	off80,80
83	li	off96,96
84	li	off112,112
	/* r0 = 0: first pass, v16-v23 do not hold any state yet */
85	li	r0,0
86
87	/* Enough room for saving 10 non volatile VMX registers */
	/*
	 * r6 addresses the slots for v20-v27 and r7 the slots for v28-v29,
	 * both below the GPR save area.
	 * NOTE(review): r1 - 56 is not 16 byte aligned when r1 is; this
	 * presumably relies on stvx/lvx ignoring the low four address bits -
	 * confirm against the ISA.
	 */
88	subi	r6,r1,56+10*16
89	subi	r7,r1,56+2*16
90
91	stvx	v20,0,r6
92	stvx	v21,off16,r6
93	stvx	v22,off32,r6
94	stvx	v23,off48,r6
95	stvx	v24,off64,r6
96	stvx	v25,off80,r6
97	stvx	v26,off96,r6
98	stvx	v27,off112,r6
99	stvx	v28,0,r7
100	stvx	v29,off16,r7
101
	/* Keep the incoming crc; .Lzero returns it when len is 0 */
102	mr	r10,r3
103
	/* v0 = all ones, used only to build the two masks */
104	vxor	zeroes,zeroes,zeroes
105	vspltisw v0,-1
106
	/* Shift all-ones in from the right: low 32 / low 64 bits set */
107	vsldoi	mask_32bit,zeroes,v0,4
108	vsldoi	mask_64bit,zeroes,v0,8
109
110	/* Get the initial value into v8 */
111	vxor	v8,v8,v8
112	MTVRD(v8, R3)
113#ifdef REFLECT
114	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
115#else
116	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
117#endif
118
119#ifdef BYTESWAP_DATA
	/* Load the permute control vector used by every VPERM() */
120	addis	r3,r2,.byteswap_constant@toc@ha
121	addi	r3,r3,.byteswap_constant@toc@l
122
123	lvx	byteswap,0,r3
	/*
	 * NOTE(review): r3 is reloaded with a table address (.constants or
	 * .short_constants) before its next use on every path, so this
	 * increment looks unused.
	 */
124	addi	r3,r3,16
125#endif
126
	/* Inputs shorter than 256 bytes take the straight-line .Lshort path */
127	cmpdi	r5,256
128	blt	.Lshort
129
	/* r6 = len with the low 7 bits cleared, i.e. a multiple of 128 */
130	rldicr	r6,r5,0,56
131
132	/* Checksum in blocks of MAX_SIZE */
	/*
	 * r7 = min(r6, MAX_SIZE) is the size of this pass, r9 = MAX_SIZE,
	 * and r6 is reduced by the size of the pass at label 2.
	 */
1331:	lis	r7,MAX_SIZE@h
134	ori	r7,r7,MAX_SIZE@l
135	mr	r9,r7
136	cmpd	r6,r7
137	bgt	2f
138	mr	r7,r6
1392:	subf	r6,r7,r6
140
141	/* our main loop does 128 bytes at a time */
142	srdi	r7,r7,7
143
144	/*
145	 * Work out the offset into the constants table to start at. Each
146	 * constant is 16 bytes, and it is used against 128 bytes of input
147	 * data - 128 / 16 = 8
148	 */
	/* r8 = MAX_SIZE / 8 - 16 * (number of 128 byte chunks in this pass) */
149	sldi	r8,r7,4
150	srdi	r9,r9,3
151	subf	r8,r8,r9
152
153	/* We reduce our final 128 bytes in a separate step */
154	addi	r7,r7,-1
155	mtctr	r7
156
157	addis	r3,r2,.constants@toc@ha
158	addi	r3,r3,.constants@toc@l
159
160	/* Find the start of our constants */
161	add	r3,r3,r8
162
163	/* zero v0-v7 which will contain our checksums */
164	vxor	v0,v0,v0
165	vxor	v1,v1,v1
166	vxor	v2,v2,v2
167	vxor	v3,v3,v3
168	vxor	v4,v4,v4
169	vxor	v5,v5,v5
170	vxor	v6,v6,v6
171	vxor	v7,v7,v7
172
173	lvx	const1,0,r3
174
175	/*
176	 * If we are looping back to consume more data we use the values
177	 * already in v16-v23.
178	 */
179	cmpdi	r0,1
180	beq	2f
181
182	/* First warm up pass */
	/* Load the first 128 bytes into v16-v23, byte swapping if required */
183	lvx	v16,0,r4
184	lvx	v17,off16,r4
185	VPERM(v16,v16,v16,byteswap)
186	VPERM(v17,v17,v17,byteswap)
187	lvx	v18,off32,r4
188	lvx	v19,off48,r4
189	VPERM(v18,v18,v18,byteswap)
190	VPERM(v19,v19,v19,byteswap)
191	lvx	v20,off64,r4
192	lvx	v21,off80,r4
193	VPERM(v20,v20,v20,byteswap)
194	VPERM(v21,v21,v21,byteswap)
195	lvx	v22,off96,r4
196	lvx	v23,off112,r4
197	VPERM(v22,v22,v22,byteswap)
198	VPERM(v23,v23,v23,byteswap)
199	addi	r4,r4,8*16
200
201	/* xor in initial value */
202	vxor	v16,v16,v8
203
	/*
	 * bdz decrements ctr first. It is taken when ctr was 1, i.e. when
	 * this pass covers exactly two 128 byte chunks.
	 */
2042:	bdz	.Lfirst_warm_up_done
205
	/* const2 = the table entry after const1; r3 stays on that entry */
206	addi	r3,r3,16
207	lvx	const2,0,r3
208
209	/* Second warm up pass */
	/*
	 * Start the multiplies for the chunk already in v16-v23 and load the
	 * next chunk into each register right after it has been consumed.
	 * Nothing is accumulated into v0-v7 yet.
	 * ori r2,r2,0 leaves r2 unchanged; presumably a nop that shapes
	 * instruction grouping - TODO confirm.
	 */
210	VPMSUMD(v8,v16,const1)
211	lvx	v16,0,r4
212	VPERM(v16,v16,v16,byteswap)
213	ori	r2,r2,0
214
215	VPMSUMD(v9,v17,const1)
216	lvx	v17,off16,r4
217	VPERM(v17,v17,v17,byteswap)
218	ori	r2,r2,0
219
220	VPMSUMD(v10,v18,const1)
221	lvx	v18,off32,r4
222	VPERM(v18,v18,v18,byteswap)
223	ori	r2,r2,0
224
225	VPMSUMD(v11,v19,const1)
226	lvx	v19,off48,r4
227	VPERM(v19,v19,v19,byteswap)
228	ori	r2,r2,0
229
230	VPMSUMD(v12,v20,const1)
231	lvx	v20,off64,r4
232	VPERM(v20,v20,v20,byteswap)
233	ori	r2,r2,0
234
235	VPMSUMD(v13,v21,const1)
236	lvx	v21,off80,r4
237	VPERM(v21,v21,v21,byteswap)
238	ori	r2,r2,0
239
240	VPMSUMD(v14,v22,const1)
241	lvx	v22,off96,r4
242	VPERM(v22,v22,v22,byteswap)
243	ori	r2,r2,0
244
245	VPMSUMD(v15,v23,const1)
246	lvx	v23,off112,r4
247	VPERM(v23,v23,v23,byteswap)
248
249	addi	r4,r4,8*16
250
	/* Taken when no chunk is left for the pipelined loop below */
251	bdz	.Lfirst_cool_down
252
253	/*
254	 * main loop. We modulo schedule it such that it takes three iterations
255	 * to complete - first iteration load, second iteration vpmsum, third
256	 * iteration xor.
257	 */
	/*
	 * All eight multiplies of one iteration use the same table entry:
	 * const1 is loaded from r3 at the top, and const2 still holds the
	 * value loaded from that same address by the previous iteration (or
	 * by the second warm up pass). const2 is reloaded with the following
	 * entry after the v11 multiply, for use by the next iteration.
	 */
258	.balign	16
2594:	lvx	const1,0,r3
260	addi	r3,r3,16
261	ori	r2,r2,0
262
263	vxor	v0,v0,v8
264	VPMSUMD(v8,v16,const2)
265	lvx	v16,0,r4
266	VPERM(v16,v16,v16,byteswap)
267	ori	r2,r2,0
268
269	vxor	v1,v1,v9
270	VPMSUMD(v9,v17,const2)
271	lvx	v17,off16,r4
272	VPERM(v17,v17,v17,byteswap)
273	ori	r2,r2,0
274
275	vxor	v2,v2,v10
276	VPMSUMD(v10,v18,const2)
277	lvx	v18,off32,r4
278	VPERM(v18,v18,v18,byteswap)
279	ori	r2,r2,0
280
281	vxor	v3,v3,v11
282	VPMSUMD(v11,v19,const2)
283	lvx	v19,off48,r4
284	VPERM(v19,v19,v19,byteswap)
285	lvx	const2,0,r3
286	ori	r2,r2,0
287
288	vxor	v4,v4,v12
289	VPMSUMD(v12,v20,const1)
290	lvx	v20,off64,r4
291	VPERM(v20,v20,v20,byteswap)
292	ori	r2,r2,0
293
294	vxor	v5,v5,v13
295	VPMSUMD(v13,v21,const1)
296	lvx	v21,off80,r4
297	VPERM(v21,v21,v21,byteswap)
298	ori	r2,r2,0
299
300	vxor	v6,v6,v14
301	VPMSUMD(v14,v22,const1)
302	lvx	v22,off96,r4
303	VPERM(v22,v22,v22,byteswap)
304	ori	r2,r2,0
305
306	vxor	v7,v7,v15
307	VPMSUMD(v15,v23,const1)
308	lvx	v23,off112,r4
309	VPERM(v23,v23,v23,byteswap)
310
311	addi	r4,r4,8*16
312
313	bdnz	4b
314
315.Lfirst_cool_down:
316	/* First cool down pass */
	/*
	 * Drain step without any further loads: xor the outstanding products
	 * v8-v15 into the accumulators v0-v7 and multiply the last chunk that
	 * was loaded (v16-v23) by the next table entry. r3 is left 16 bytes
	 * past that entry.
	 * ori r2,r2,0 leaves r2 unchanged; presumably a nop that shapes
	 * instruction grouping - TODO confirm.
	 */
317	lvx	const1,0,r3
318	addi	r3,r3,16
319
320	vxor	v0,v0,v8
321	VPMSUMD(v8,v16,const1)
322	ori	r2,r2,0
323
324	vxor	v1,v1,v9
325	VPMSUMD(v9,v17,const1)
326	ori	r2,r2,0
327
328	vxor	v2,v2,v10
329	VPMSUMD(v10,v18,const1)
330	ori	r2,r2,0
331
332	vxor	v3,v3,v11
333	VPMSUMD(v11,v19,const1)
334	ori	r2,r2,0
335
336	vxor	v4,v4,v12
337	VPMSUMD(v12,v20,const1)
338	ori	r2,r2,0
339
340	vxor	v5,v5,v13
341	VPMSUMD(v13,v21,const1)
342	ori	r2,r2,0
343
344	vxor	v6,v6,v14
345	VPMSUMD(v14,v22,const1)
346	ori	r2,r2,0
347
348	vxor	v7,v7,v15
349	VPMSUMD(v15,v23,const1)
350	ori	r2,r2,0
351
352.Lsecond_cool_down:
353	/* Second cool down pass */
	/* Fold the final set of products into the accumulators v0-v7 */
354	vxor	v0,v0,v8
355	vxor	v1,v1,v9
356	vxor	v2,v2,v10
357	vxor	v3,v3,v11
358	vxor	v4,v4,v12
359	vxor	v5,v5,v13
360	vxor	v6,v6,v14
361	vxor	v7,v7,v15
362
363#ifdef REFLECT
364	/*
365	 * vpmsumd produces a 96 bit result in the least significant bits
366	 * of the register. Since we are bit reflected we have to shift it
367	 * left 32 bits so it occupies the least significant bits in the
368	 * bit reflected domain.
369	 */
370	vsldoi	v0,v0,zeroes,4
371	vsldoi	v1,v1,zeroes,4
372	vsldoi	v2,v2,zeroes,4
373	vsldoi	v3,v3,zeroes,4
374	vsldoi	v4,v4,zeroes,4
375	vsldoi	v5,v5,zeroes,4
376	vsldoi	v6,v6,zeroes,4
377	vsldoi	v7,v7,zeroes,4
378#endif
379
380	/* xor with last 1024 bits */
	/*
	 * The last 128 bytes of this pass are loaded (and byte swapped if
	 * required) but not multiplied; they are xored with the accumulators
	 * to give the 1024 bit state in v16-v23.
	 */
381	lvx	v8,0,r4
382	lvx	v9,off16,r4
383	VPERM(v8,v8,v8,byteswap)
384	VPERM(v9,v9,v9,byteswap)
385	lvx	v10,off32,r4
386	lvx	v11,off48,r4
387	VPERM(v10,v10,v10,byteswap)
388	VPERM(v11,v11,v11,byteswap)
389	lvx	v12,off64,r4
390	lvx	v13,off80,r4
391	VPERM(v12,v12,v12,byteswap)
392	VPERM(v13,v13,v13,byteswap)
393	lvx	v14,off96,r4
394	lvx	v15,off112,r4
395	VPERM(v14,v14,v14,byteswap)
396	VPERM(v15,v15,v15,byteswap)
397
398	addi	r4,r4,8*16
399
400	vxor	v16,v0,v8
401	vxor	v17,v1,v9
402	vxor	v18,v2,v10
403	vxor	v19,v3,v11
404	vxor	v20,v4,v12
405	vxor	v21,v5,v13
406	vxor	v22,v6,v14
407	vxor	v23,v7,v15
408
	/*
	 * r0 = 1 tells the block loop at 1: that v16-v23 already hold state,
	 * so its first warm up load is skipped. cmpdi tests r6 (the bytes
	 * left for the 128 byte loop) before the addi, and addi does not
	 * touch cr0, so the branch is taken while data remains.
	 * The extra 128 presumably accounts for the carried state standing
	 * in for the first chunk of the next pass - TODO confirm.
	 */
409	li	r0,1
410	cmpdi	r6,0
411	addi	r6,r6,128
412	bne	1b
413
414	/* Work out how many bytes we have left */
	/* r5 = len % 128 */
415	andi.	r5,r5,127
416
417	/* Calculate where in the constant table we need to start */
	/*
	 * r3 is still 16 bytes past the last constant loaded by the passes
	 * above; step it forward by 128 - (len % 128).
	 */
418	subfic	r6,r5,128
419	add	r3,r3,r6
420
421	/* How many 16 byte chunks are in the tail */
422	srdi	r7,r5,4
423	mtctr	r7
424
425	/*
426	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
427	 * 32 bits to include the trailing 32 bits of zeros
428	 */
	/*
	 * Eight constants are loaded into v0-v7 and each is multiplied with
	 * the matching state register v16-v23; r3 then points at the
	 * constants for the tail.
	 */
429	lvx	v0,0,r3
430	lvx	v1,off16,r3
431	lvx	v2,off32,r3
432	lvx	v3,off48,r3
433	lvx	v4,off64,r3
434	lvx	v5,off80,r3
435	lvx	v6,off96,r3
436	lvx	v7,off112,r3
437	addi	r3,r3,8*16
438
439	VPMSUMW(v0,v16,v0)
440	VPMSUMW(v1,v17,v1)
441	VPMSUMW(v2,v18,v2)
442	VPMSUMW(v3,v19,v3)
443	VPMSUMW(v4,v20,v4)
444	VPMSUMW(v5,v21,v5)
445	VPMSUMW(v6,v22,v6)
446	VPMSUMW(v7,v23,v7)
447
448	/* Now reduce the tail (0 - 112 bytes) */
	/*
	 * Unrolled up to seven times: each remaining 16 byte block is
	 * multiplied by the constant at the same offset from r3 and xored
	 * into v0. ctr holds the number of blocks; bdz leaves as soon as the
	 * last one has been handled.
	 */
449	cmpdi	r7,0
450	beq	1f
451
452	lvx	v16,0,r4
453	lvx	v17,0,r3
454	VPERM(v16,v16,v16,byteswap)
455	VPMSUMW(v16,v16,v17)
456	vxor	v0,v0,v16
457	bdz	1f
458
459	lvx	v16,off16,r4
460	lvx	v17,off16,r3
461	VPERM(v16,v16,v16,byteswap)
462	VPMSUMW(v16,v16,v17)
463	vxor	v0,v0,v16
464	bdz	1f
465
466	lvx	v16,off32,r4
467	lvx	v17,off32,r3
468	VPERM(v16,v16,v16,byteswap)
469	VPMSUMW(v16,v16,v17)
470	vxor	v0,v0,v16
471	bdz	1f
472
473	lvx	v16,off48,r4
474	lvx	v17,off48,r3
475	VPERM(v16,v16,v16,byteswap)
476	VPMSUMW(v16,v16,v17)
477	vxor	v0,v0,v16
478	bdz	1f
479
480	lvx	v16,off64,r4
481	lvx	v17,off64,r3
482	VPERM(v16,v16,v16,byteswap)
483	VPMSUMW(v16,v16,v17)
484	vxor	v0,v0,v16
485	bdz	1f
486
487	lvx	v16,off80,r4
488	lvx	v17,off80,r3
489	VPERM(v16,v16,v16,byteswap)
490	VPMSUMW(v16,v16,v17)
491	vxor	v0,v0,v16
492	bdz	1f
493
494	lvx	v16,off96,r4
495	lvx	v17,off96,r3
496	VPERM(v16,v16,v16,byteswap)
497	VPMSUMW(v16,v16,v17)
498	vxor	v0,v0,v16
499
500	/* Now xor all the parallel chunks together */
	/* Pairwise xor tree; the combined value ends up in v0 */
5011:	vxor	v0,v0,v1
502	vxor	v2,v2,v3
503	vxor	v4,v4,v5
504	vxor	v6,v6,v7
505
506	vxor	v0,v0,v2
507	vxor	v4,v4,v6
508
509	vxor	v0,v0,v4
510
511.Lbarrett_reduction:
	/*
	 * Entered with the folded value in v0, either by falling through from
	 * the long path or by a branch from .Lshort. Produces the final value
	 * in r3 and falls through into .Lout.
	 */
512	/* Barrett constants */
513	addis	r3,r2,.barrett_constants@toc@ha
514	addi	r3,r3,.barrett_constants@toc@l
515
516	lvx	const1,0,r3
517	lvx	const2,off16,r3
518
	/* v1 = v0 rotated by 8 bytes, i.e. with its two 64 bit halves swapped */
519	vsldoi	v1,v0,v0,8
520	vxor	v0,v0,v1		/* xor two 64 bit results together */
521
522#ifdef REFLECT
523	/* shift left one bit */
524	vspltisb v1,1
525	vsl	v0,v0,v1
526#endif
527
	/* Keep only the low 64 bits of v0 */
528	vand	v0,v0,mask_64bit
529#ifndef REFLECT
530	/*
531	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
532	 * the multiple of our polynomial that we need to subtract. By
533	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
534	 * result back down 2x bits, we round down to the nearest multiple.
535	 */
536	VPMSUMD(v1,v0,const1)	/* ma */
537	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
538	VPMSUMD(v1,v1,const2)	/* qn */
539	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
540
541	/*
542	 * Get the result into r3. We need to shift it left 8 bytes:
543	 * V0 [ 0 1 2 X ]
544	 * V0 [ 0 X 2 3 ]
545	 */
546	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
547#else
548	/*
549	 * The reflected version of Barrett reduction. Instead of bit
550	 * reflecting our data (which is expensive to do), we bit reflect our
551	 * constants and our algorithm, which means the intermediate data in
552	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
553	 * the algorithm because we don't carry in mod 2 arithmetic.
554	 */
555	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
556	VPMSUMD(v1,v1,const1)		/* ma */
557	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
558	VPMSUMD(v1,v1,const2)		/* qn */
559	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
560
561	/*
562	 * Since we are bit reflected, the result (ie the low 32 bits) is in
563	 * the high 32 bits. We just need to shift it left 4 bytes
564	 * V0 [ 0 1 X 3 ]
565	 * V0 [ 0 X 2 3 ]
566	 */
567	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
568#endif
569
570	/* Get it into r3 */
571	MFVRD(R3, v0)
572
573.Lout:
	/*
	 * Common exit, reached with the return value already in r3. Reloads
	 * v20-v29 and r25-r31 from the same addresses below r1 that were
	 * written at function entry, then returns.
	 * The off16..off112 registers are still needed as lvx index
	 * registers here, so the vector reloads come before the ld of
	 * r25-r31.
	 */
574	subi	r6,r1,56+10*16
575	subi	r7,r1,56+2*16
576
577	lvx	v20,0,r6
578	lvx	v21,off16,r6
579	lvx	v22,off32,r6
580	lvx	v23,off48,r6
581	lvx	v24,off64,r6
582	lvx	v25,off80,r6
583	lvx	v26,off96,r6
584	lvx	v27,off112,r6
585	lvx	v28,0,r7
586	lvx	v29,off16,r7
587
588	ld	r31,-8(r1)
589	ld	r30,-16(r1)
590	ld	r29,-24(r1)
591	ld	r28,-32(r1)
592	ld	r27,-40(r1)
593	ld	r26,-48(r1)
594	ld	r25,-56(r1)
595
596	blr
597
598.Lfirst_warm_up_done:
	/*
	 * Reached from the bdz that follows the first warm up pass, when the
	 * counter runs out before the pipelined loop can start. The single
	 * chunk held in v16-v23 is multiplied by one table entry without any
	 * interleaved loads, r3 is left 16 bytes past that entry, and
	 * control joins the normal flow at the second cool down pass.
	 */
599	lvx	const1,0,r3
600	addi	r3,r3,16
601
602	VPMSUMD(v8,v16,const1)
603	VPMSUMD(v9,v17,const1)
604	VPMSUMD(v10,v18,const1)
605	VPMSUMD(v11,v19,const1)
606	VPMSUMD(v12,v20,const1)
607	VPMSUMD(v13,v21,const1)
608	VPMSUMD(v14,v22,const1)
609	VPMSUMD(v15,v23,const1)
610
611	b	.Lsecond_cool_down
612
613.Lshort:
	/*
	 * Straight-line path for len < 256 (r5 = len, r4 = data, v8 = the
	 * initial value). Every 16 byte block is multiplied by its own
	 * constant from .short_constants, the products are xored together,
	 * and the result goes through the Barrett reduction. A length of 0
	 * is handed to .Lzero.
	 */
614	cmpdi	r5,0
615	beq	.Lzero
616
617	addis	r3,r2,.short_constants@toc@ha
618	addi	r3,r3,.short_constants@toc@l
619
620	/* Calculate where in the constant table we need to start */
	/* r3 = .short_constants + (256 - len) */
621	subfic	r6,r5,256
622	add	r3,r3,r6
623
624	/* How many 16 byte chunks? */
	/*
	 * ctr = len / 16. Each bdz below leaves once that many blocks have
	 * been multiplied; this relies on len being a non-zero multiple of
	 * 16, as the comment at the top of the file requires.
	 */
625	srdi	r7,r5,4
626	mtctr	r7
627
	/* v19 and v20 are the two accumulators filled by the .LvN chain */
628	vxor	v19,v19,v19
629	vxor	v20,v20,v20
630
	/*
	 * NOTE(review): the VPERM() calls here pass the constant register as
	 * the second source, while the other paths pass the data register
	 * twice. Presumably the byteswap control vector only selects bytes
	 * of the first source - confirm against .byteswap_constant.
	 */
631	lvx	v0,0,r4
632	lvx	v16,0,r3
633	VPERM(v0,v0,v16,byteswap)
634	vxor	v0,v0,v8	/* xor in initial value */
635	VPMSUMW(v0,v0,v16)
636	bdz	.Lv0
637
638	lvx	v1,off16,r4
639	lvx	v17,off16,r3
640	VPERM(v1,v1,v17,byteswap)
641	VPMSUMW(v1,v1,v17)
642	bdz	.Lv1
643
644	lvx	v2,off32,r4
645	lvx	v16,off32,r3
646	VPERM(v2,v2,v16,byteswap)
647	VPMSUMW(v2,v2,v16)
648	bdz	.Lv2
649
650	lvx	v3,off48,r4
651	lvx	v17,off48,r3
652	VPERM(v3,v3,v17,byteswap)
653	VPMSUMW(v3,v3,v17)
654	bdz	.Lv3
655
656	lvx	v4,off64,r4
657	lvx	v16,off64,r3
658	VPERM(v4,v4,v16,byteswap)
659	VPMSUMW(v4,v4,v16)
660	bdz	.Lv4
661
662	lvx	v5,off80,r4
663	lvx	v17,off80,r3
664	VPERM(v5,v5,v17,byteswap)
665	VPMSUMW(v5,v5,v17)
666	bdz	.Lv5
667
668	lvx	v6,off96,r4
669	lvx	v16,off96,r3
670	VPERM(v6,v6,v16,byteswap)
671	VPMSUMW(v6,v6,v16)
672	bdz	.Lv6
673
674	lvx	v7,off112,r4
675	lvx	v17,off112,r3
676	VPERM(v7,v7,v17,byteswap)
677	VPMSUMW(v7,v7,v17)
678	bdz	.Lv7
679
	/*
	 * More than 128 bytes: move both pointers on so the same off16..off112
	 * offsets can be reused. v8 held the initial value, which has already
	 * been xored into v0, and now becomes a product register.
	 */
680	addi	r3,r3,128
681	addi	r4,r4,128
682
683	lvx	v8,0,r4
684	lvx	v16,0,r3
685	VPERM(v8,v8,v16,byteswap)
686	VPMSUMW(v8,v8,v16)
687	bdz	.Lv8
688
689	lvx	v9,off16,r4
690	lvx	v17,off16,r3
691	VPERM(v9,v9,v17,byteswap)
692	VPMSUMW(v9,v9,v17)
693	bdz	.Lv9
694
695	lvx	v10,off32,r4
696	lvx	v16,off32,r3
697	VPERM(v10,v10,v16,byteswap)
698	VPMSUMW(v10,v10,v16)
699	bdz	.Lv10
700
701	lvx	v11,off48,r4
702	lvx	v17,off48,r3
703	VPERM(v11,v11,v17,byteswap)
704	VPMSUMW(v11,v11,v17)
705	bdz	.Lv11
706
707	lvx	v12,off64,r4
708	lvx	v16,off64,r3
709	VPERM(v12,v12,v16,byteswap)
710	VPMSUMW(v12,v12,v16)
711	bdz	.Lv12
712
713	lvx	v13,off80,r4
714	lvx	v17,off80,r3
715	VPERM(v13,v13,v17,byteswap)
716	VPMSUMW(v13,v13,v17)
717	bdz	.Lv13
718
719	lvx	v14,off96,r4
720	lvx	v16,off96,r3
721	VPERM(v14,v14,v16,byteswap)
722	VPMSUMW(v14,v14,v16)
723	bdz	.Lv14
724
725	lvx	v15,off112,r4
726	lvx	v17,off112,r3
727	VPERM(v15,v15,v17,byteswap)
728	VPMSUMW(v15,v15,v17)
729
	/*
	 * Fall-through chain: entering at .LvN xors the products vN down to
	 * v0 into v19 (odd N) and v20 (even N), so only the registers that
	 * were actually computed are combined.
	 */
730.Lv15:	vxor	v19,v19,v15
731.Lv14:	vxor	v20,v20,v14
732.Lv13:	vxor	v19,v19,v13
733.Lv12:	vxor	v20,v20,v12
734.Lv11:	vxor	v19,v19,v11
735.Lv10:	vxor	v20,v20,v10
736.Lv9:	vxor	v19,v19,v9
737.Lv8:	vxor	v20,v20,v8
738.Lv7:	vxor	v19,v19,v7
739.Lv6:	vxor	v20,v20,v6
740.Lv5:	vxor	v19,v19,v5
741.Lv4:	vxor	v20,v20,v4
742.Lv3:	vxor	v19,v19,v3
743.Lv2:	vxor	v20,v20,v2
744.Lv1:	vxor	v19,v19,v1
745.Lv0:	vxor	v20,v20,v0
746
	/* Merge the two accumulators into v0 for the Barrett reduction */
747	vxor	v0,v19,v20
748
749	b	.Lbarrett_reduction
750
751.Lzero:
	/*
	 * len == 0: return the crc that was passed in (copied to r10 at
	 * function entry) and leave through the common exit, which reloads
	 * the registers written to the stack at entry.
	 */
752	mr	r3,r10
753	b	.Lout
754
755FUNC_END(CRC_FUNCTION_NAME)
756