xref: /openbmc/linux/arch/powerpc/lib/memcmp_64.S (revision 5a1ea477)
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
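/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS save r3/r4/r5 and the link register, set up a
 * temporary stack frame and call enter_vmx_ops/exit_vmx_ops.  ENTER_VMX_OPS
 * also leaves cr1 set from the return value of enter_vmx_ops, so callers can
 * test it (beq cr1,...) and fall back to the non-VMX path when VMX cannot
 * be used.
 */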
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
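
/*
 * Illustration only (not part of this file): a rough C analogue of the idea
 * behind LD_VSR_CROSS16B.  An unaligned load can be synthesised from the two
 * aligned loads that straddle it, combined with shifts; the macro does the
 * 16-byte equivalent with lvx plus VPERM using the LVS-generated byte-select
 * mask.  load_cross8b() below is a hypothetical helper, shown for a
 * big-endian-style in-register byte order (lowest address in the MSB).
 *
 *	static inline unsigned long load_cross8b(const unsigned char *p)
 *	{
 *		const unsigned long *aligned =
 *			(const unsigned long *)((unsigned long)p & ~7UL);
 *		unsigned int shift = ((unsigned long)p & 7) * 8;
 *
 *		if (!shift)
 *			return aligned[0];
 *		// the first aligned word supplies the high bytes of the result
 *		return (aligned[0] << shift) | (aligned[1] >> (64 - shift));
 *	}
 */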

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx
 */
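
/*
 * Illustration only (not part of this file): a minimal C sketch of the
 * dispatch performed by the entry code below.  short_loop(), sameoffset_cmp()
 * and diffoffset_cmp() are hypothetical names standing in for the .Lshort,
 * .Lsameoffset_xxxx and .Ldiffoffset_xxxx handlers.
 *
 *	static int memcmp_dispatch(const void *s1, const void *s2, size_t n)
 *	{
 *		if (!n)
 *			return 0;
 *		if (n <= 7)
 *			return short_loop(s1, s2, n);	// byte-by-byte
 *		if ((((unsigned long)s1 ^ (unsigned long)s2) & 7) == 0)
 *			return sameoffset_cmp(s1, s2, n);
 *		return diffoffset_cmp(s1, s2, n);
 *	}
 */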
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Check whether the src/dst addresses share the same offset
	 * relative to an 8-byte alignment boundary; if not, the
	 * .Ldiffoffset path is used.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than 8 bytes,
	 * even at aligned addresses.
	 */
	cmpdi   cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the leading bytes that are not 8-byte aligned,
	 * so that the rest of the comparison can run on 8-byte alignment.
	 */
	andi.   r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
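	/*
	 * Illustration only: a C sketch of the partial leading-doubleword
	 * compare done by the instructions below, assuming the LD macro's
	 * in-register byte order (lowest address in the most significant
	 * byte).  load8() is a hypothetical aligned 8-byte load helper.
	 *
	 *	unsigned int off = (unsigned long)s1 & 7;  // same for s2 here
	 *	unsigned long a = load8(s1 - off);	// aligned load
	 *	unsigned long b = load8(s2 - off);
	 *	a <<= off * 8;		// discard the bytes before s1/s2
	 *	b <<= off * 8;
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	n -= 8 - off;
	 *	s1 += 8 - off;
	 *	s2 += 8 - off;
	 */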
	rlwinm  r6,r3,3,26,28
	beq     .Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi   cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi   cr5,r5,7
	srdi    r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi  r5,r5,61
	mtctr   r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi   r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
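	/*
	 * Illustration only: a C sketch of the guard and tail compare below.
	 * short_loop() and load8() are hypothetical stand-ins for .Lshort and
	 * the LD macro (lowest address in the most significant byte).
	 *
	 *	if (((unsigned long)s2 & 0xfff) > 0xff8)
	 *		return short_loop(s1, s2, n);	// s2 + 7 may cross a page
	 *
	 *	unsigned int shift = (8 - n) * 8;
	 *	unsigned long a = load8(s1) >> shift;	// keep only n valid bytes
	 *	unsigned long b = load8(s2) >> shift;
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	return 0;
	 */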
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic  r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try the vmx loop if the length is 4K bytes or greater */
	cmpldi  cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is 8-byte aligned */
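	/*
	 * Illustration only: a C sketch of the main 32-bytes-per-iteration
	 * loop implemented below.  The asm version software-pipelines the
	 * loads and spreads the comparisons across cr0/cr1/cr6/cr7 to hide
	 * load latency; load8() is a hypothetical 8-byte load helper.
	 *
	 *	while (n >= 32) {
	 *		for (int i = 0; i < 4; i++) {
	 *			unsigned long a = load8(s1 + 8 * i);
	 *			unsigned long b = load8(s2 + 8 * i);
	 *
	 *			if (a != b)
	 *				return a > b ? 1 : -1;
	 *		}
	 *		s1 += 32;
	 *		s2 += 32;
	 *		n -= 32;
	 *	}
	 *	// the remaining n < 32 bytes go back to the short loop
	 */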
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:   /* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset relative
	 * to an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to find a difference early, within the first 32 bytes.
	 * Before using VMX instructions, which incur a 32x128-bit VMX
	 * register load/restore penalty, we compare the first 32 bytes
	 * so that we can catch ~80% of the failing cases.
	 */
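	/*
	 * Illustration only: a C sketch of the 32-byte pre-check below, which
	 * compares four doublewords with ordinary loads before paying the VMX
	 * entry cost.  load8() is a hypothetical 8-byte load helper.
	 *
	 *	for (int i = 0; i < 4; i++) {
	 *		unsigned long a = load8(s1);
	 *		unsigned long b = load8(s2);
	 *
	 *		if (a != b)
	 *			return a > b ? 1 : -1;
	 *		s1 += 8;
	 *		s2 += 8;
	 *		n -= 8;
	 *	}
	 *	// only now is entering VMX worthwhile for the large remainder
	 */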

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset as r3 relative
	 * to a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is no less than 4KB. We need to further align to
	 * 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf  r5,128
	EXIT_VMX_OPS
	mtocrf  128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes per loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi  r5,r5,59
	li	off16,16

.balign 16
5:
	lvx 	v0,0,r3
	lvx 	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx 	v0,off16,r3
	lvx 	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* find the difference within the last-loaded 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm  r6,r3,3,26,28
	beq     .Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero
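
	/*
	 * Illustration only: a C sketch of the block above (off is non-zero
	 * here).  s1 is aligned down and its leading bytes are masked off
	 * with a left-then-right shift, while s2 is loaded unaligned and
	 * right-shifted by the same amount, so both values hold the first
	 * 8 - off bytes of their buffers.  load8() is a hypothetical 8-byte
	 * load helper with the LD macro's in-register byte order.
	 *
	 *	unsigned int off = (unsigned long)s1 & 7;
	 *	unsigned long a = load8(s1 - off);	// aligned load of s1
	 *	unsigned long b = load8(s2);		// unaligned load of s2
	 *	a = (a << off * 8) >> off * 8;	// zero the bytes before s1
	 *	b >>= off * 8;			// keep the first 8 - off bytes
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	n -= 8 - off;
	 *	s1 += 8 - off;
	 *	s2 += 8 - off;
	 */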

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or greater */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi   cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.   r6,r3,0xf
	li	off16,16
	beq     .Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx     v5,0,r3
	lvx     v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic  r6,r6,16
	subf    r5,r6,r5
	add     r3,r3,r6
	add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx     v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* loop over 32 bytes at a time */
	clrldi  r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the difference is within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)