xref: /openbmc/linux/tools/testing/selftests/powerpc/stringloops/memcmp_64.S (revision 05cf4fe738242183f1237f1b3a28b4479348c0a1)
1/*
2 * Author: Anton Blanchard <anton@au.ibm.com>
3 * Copyright 2015 IBM Corporation.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; either version
8 * 2 of the License, or (at your option) any later version.
9 */
10#include <asm/ppc_asm.h>
11#include <asm/export.h>
12#include <asm/ppc-opcode.h>
13
/*
 * Index registers holding the constant byte offsets 8/16/24 for the
 * indexed (X-form) loads.  They are only valid after the code has
 * executed the matching "li offN,N".
 */
#define off8	r6
#define off16	r7
#define off24	r8

/*
 * Data registers for the doubleword compares.  rA..rC (r9-r11) are used
 * without being saved.  rD..rH (r27-r31) are only used by the 32-byte
 * loop, which stores them below the stack pointer first and reloads them
 * on every exit path.
 */
#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31
26
/*
 * Endian-dependent load helpers.
 *
 * LH/LW/LD are indexed loads of 2/4/8 bytes.  The little-endian build
 * uses the byte-reversed forms, so in both builds the byte at the lowest
 * address ends up most significant in the register and an unsigned
 * register compare orders two buffers the way memcmp() must.
 *
 * LVS/VPERM are the matching "load vector for shift" / permute pair used
 * to assemble an unaligned 16-byte quantity from two aligned quadwords;
 * little-endian swaps the shift direction and the two source operands.
 */
#ifndef __LITTLE_ENDIAN__
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#else /* __LITTLE_ENDIAN__ */
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#endif
42
/* Minimum remaining length (bytes) for which the VMX loops are tried. */
#define VMX_THRESH 4096

/*
 * ENTER_VMX_OPS: call enter_vmx_ops() while preserving r3/r4/r5 and LR.
 *
 * LR is stored in the caller's frame at 16(r1); r3, r4 and r5 are stored
 * in the R31/R30/R29 slots of the frame that the stdu then creates.
 * After the call the return value is compared against zero into cr1
 * (callers use "beq cr1,<non-VMX path>"), the three argument registers
 * and LR are reloaded, and the frame is popped.
 *
 * Changes r0 and cr1, plus whatever enter_vmx_ops() itself changes.
 */
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r0,16(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r5,STK_REG(R29)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r3,STK_REG(R31)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0
59
/*
 * EXIT_VMX_OPS: call exit_vmx_ops() while preserving r3/r4/r5 and LR.
 *
 * Same save/restore sequence as ENTER_VMX_OPS, except that the return
 * value of the callee is ignored (no compare is performed).
 *
 * Changes r0, plus whatever exit_vmx_ops() itself changes.
 */
#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r0,16(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r5,STK_REG(R29)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r3,STK_REG(R31)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0
74
/*
 * LD_VSR_CROSS16B(_vaddr, _vmask, _v1st_qw, _v2nd_qw, _v_res)
 *
 * Build the 16 bytes that start at the unaligned address _vaddr.  The
 * aligned quadword containing _vaddr must already be in _v1st_qw; this
 * macro loads the following aligned quadword into _v2nd_qw and permutes
 * the two into _v_res.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 * _vmask   : permute control produced by LVS for _vaddr.
 * _v1st_qw : first aligned quadword, already loaded by the caller,
 *            e.g. 0xyyyyyyyyyyyyy012 on big endian.
 * _v2nd_qw : receives the second aligned quadword,
 *            e.g. 0x3456789abcdefzzz on big endian.
 * _v_res   : receives the permuted result,
 *            e.g. 0x0123456789abcdef on big endian.
 *
 * The register off16 must hold 16 when the macro is expanded.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
97
/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * In:   r3 = s1, r4 = s2, r5 = n
 * Out:  r3 = 0 when the buffers are equal; otherwise a positive or
 *       negative value (byte difference from the byte loop, +1/-1 from
 *       the doubleword and vector paths).
 *
 * Two families of handlers follow:
 * 1) s1 and s2 have the same offset from an 8-byte boundary:
 *    labels named .Lsameoffset_xxxx
 * 2) s1 and s2 have different offsets from an 8-byte boundary:
 *    labels named .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
	/* cr0 <- ((s1 ^ s2) & 7) == 0, i.e. "same offset"; it is consumed
	 * later at .Lno_short.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* cr1 <- n == 0 ?    cr6 <- n > 7 ? */
	cmpdi	cr1,r5,0
	cmpdi   cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short
	/* 1..7 bytes: fall through into the byte loop */
121
.Lshort:
	/* Byte-by-byte compare of r5 (non-zero) bytes.  CTR counts the bytes
	 * left; the loop body is unrolled four times, leaving through
	 * .Lnon_zero on the first differing byte (rC = *s1 - *s2) and
	 * through .Lzero once CTR reaches zero.
	 */
	mtctr	r5
.Lshort_loop:
	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	/* four bytes matched: advance both pointers */
	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	.Lshort_loop

.Lzero:
	/* buffers compared equal */
	li	r3,0
	blr
155
.Lno_short:
	/* n >= 8: issue cache-touch hints for the start of both buffers */
	dcbt	0,r3
	dcbt	0,r4
	/* cr0 still holds the (s1 ^ s2) & 7 test made at function entry */
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* s1 and s2 share the same offset inside an 8-byte word.  Compare
	 * the leading bytes up to the next 8-byte boundary so the remainder
	 * can be handled with aligned doubleword loads.
	 */
	andi.   r6,r3,7			/* cr0 <- already aligned? */

	/* r6 = (s1 & 7) * 8: number of bits in the aligned doubleword that
	 * precede the first byte to compare.
	 */
	rlwinm  r6,r3,3,26,28
	beq     .Lsameoffset_8bytes_aligned

	/* Load the aligned doublewords that contain the start of each
	 * buffer and shift the unwanted leading bytes out to the left.
	 */
	clrrdi	r3,r3,3
	LD	rA,0,r3
	sld	rA,rA,r6
	clrrdi	r4,r4,3
	LD	rB,0,r4
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* back to a byte offset */
	bne	cr0,.LcmpAB_lightweight

	/* 8 - offset bytes were compared: account for them and step both
	 * pointers to the next doubleword.
	 */
	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero
188
.Lsameoffset_8bytes_aligned:
	/* Both pointers are 8-byte aligned here.  32 bytes or more go to
	 * the unrolled loop at .Llong.
	 */
	cmpdi   cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* Compare 1..31 bytes.  This label is also entered from the
	 * different-offset and VMX paths.
	 */
	cmpdi   cr5,r5,7
	srdi    r0,r5,3			/* r0 = number of whole doublewords */
	ble	cr5,.Lcmp_rest_lt8bytes

	/* 8..31 bytes: one doubleword per iteration, r5 keeps n & 7 */
	clrldi  r5,r5,61
	mtctr   r0
.Lcmp_lt32bytes_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	.Lcmp_lt32bytes_loop

	/* nothing left after the whole doublewords? */
	cmpwi   r5,0
	beq	.Lzero
	/* otherwise fall through with 1..7 bytes remaining */
216
.Lcmp_rest_lt8bytes:
	/*
	 * Fewer than 8 bytes (r5 = 1..7) remain.  The fast path below loads a
	 * full doubleword from each buffer and shifts the surplus bytes away.
	 * The code above leaves s1 (r3) on an 8-byte boundary before coming
	 * here, so its load stays inside one aligned doubleword; s2 (r4) may
	 * be unaligned.  An 8-byte load from s2 could therefore extend past
	 * the end of the buffer into the following page and fault if that
	 * page is not mapped.
	 *
	 * Use 4K as the conservative minimum page size: if the offset of s2
	 * within a 4K page is above 0xff8, s2 + 7 lies in the next page, so
	 * take the byte-by-byte loop instead.  .Lshort only needs r3, r4 and
	 * a non-zero r5, all of which are valid here.
	 */
	clrldi	r6,r4,(64-12)		/* r6 = r4 & 0xfff */
	cmpdi	r6,0xff8
	bgt	.Lshort

	/* r6 = (8 - r5) * 8: number of surplus low-order bits to discard */
	subfic  r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	/* Exit of the byte loop: rC holds the byte difference *s1 - *s2 */
	mr	r3,rC
	blr
236
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Lengths of VMX_THRESH bytes or more try the vector loop */
	cmpldi  cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/*
	 * Scalar loop handling 32 bytes per iteration; s1 is at least
	 * 8-byte aligned here.
	 *
	 * The loop is software pipelined: the loads for the next 32 bytes
	 * are issued while the previous four doubleword pairs are compared
	 * and branched on.  The pairs and their condition fields are
	 *     rA/rB -> cr0   rC/rD -> cr1   rE/rF -> cr6   rG/rH -> cr7
	 */
	li	off24,24
	li	off16,16
	li	off8,8

	/* rD..rH live in r27-r31: keep the caller's values below the
	 * stack pointer; .Ltail and .Lout reload them.
	 */
	std	r27,-40(r1)
	std	r28,-32(r1)
	std	r29,-24(r1)
	std	r30,-16(r1)
	std	r31,-8(r1)

	/* CTR = number of 32-byte groups, r5 = leftover bytes (n & 31) */
	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	/* pipeline stage 1: load the first group, start comparing it */
	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	/* pipeline stage 2: load the second group while finishing the
	 * compares of the first one
	 */
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

	/* steady state: each compare and branch refers to data loaded one
	 * step earlier
	 */
1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	/* drain the pipeline: finish the compares that are still pending */
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	/* all whole 32-byte groups matched: reload r27-r31 and hand the
	 * 0..31 leftover bytes to the byte loop
	 */
	ld	r27,-40(r1)
	ld	r28,-32(r1)
	ld	r29,-24(r1)
	ld	r30,-16(r1)
	ld	r31,-8(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* only one 32-byte group: all four pairs are loaded, cr0 is
	 * already set; do the other three compares and test them in order
	 */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail
366
/*
 * Result stubs of the 32-byte loop.  Each one is entered after a
 * "bne crN" on the unsigned compare of one doubleword pair and returns
 * 1 when the s1 doubleword is greater, -1 otherwise.  All of them leave
 * through .Lout, which reloads r27-r31.
 */
.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1
	/* fall through */

.Lout:
	/* reload the registers saved on entry to the 32-byte loop */
	ld	r27,-40(r1)
	ld	r28,-32(r1)
	ld	r29,-24(r1)
	ld	r30,-16(r1)
	ld	r31,-8(r1)
	blr

.LcmpAB_lightweight:
	/* Same as .LcmpAB (result taken from cr0), for paths that never
	 * touched r27-r31 and so have nothing to reload.
	 */
	li	r3,1
	bgtlr
	li	r3,-1
	blr
403
#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with s1 and s2 at the same offset from an 8-byte
	 * boundary and at least VMX_THRESH bytes left.
	 *
	 * memcmp() tends to find its difference within the first 32
	 * bytes.  Enabling VMX costs a save/restore of the 32 x 128-bit
	 * vector registers, so the first 32 bytes are checked with scalar
	 * loads first, which catches roughly 80% of the mismatches.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	/* four doublewords, 8 bytes per iteration */
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	/* cr1 is "eq" when enter_vmx_ops() returned 0: use the scalar loop */
	ENTER_VMX_OPS
	beq     cr1,.Llong_novmx_cmp

3:
	/* The pointers agree modulo 8; check whether they also agree
	 * modulo 16.  If not, the permute-based loop has to be used.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* Same 16-byte offset.  If s1 is only 8-byte aligned, compare one
	 * more doubleword to reach a 16-byte boundary.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* Mismatch: the compare result in cr0 must survive the call made
	 * by EXIT_VMX_OPS.  r5 carries a copy of it across the macro,
	 * which preserves r5; the length is no longer needed.
	 */
	mfocrf  r5,128
	EXIT_VMX_OPS
	mtocrf  128,r5
	b	.LcmpAB_lightweight

4:
	/* 32 bytes per iteration: CTR = n / 32, r5 = n & 31 */
	srdi	r0,r5,5
	mtctr	r0
	clrldi  r5,r5,59
	li	off16,16

.balign 16
5:
	/* The record-form vector compare sets cr6; "bnl cr6" is taken
	 * unless every element compared equal.
	 */
	lvx 	v0,0,r3
	lvx 	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx 	v0,off16,r3
	lvx 	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	/* all whole 32-byte groups matched: finish the 0..31 byte tail */
	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	/* the mismatch is in the second quadword of this iteration */
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* r3/r4 point at the 16 bytes that differ: locate the difference
	 * with two scalar doubleword compares
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif
502
.Ldiffoffset_8bytes_make_align_start:
	/* s1 and s2 sit at different offsets inside an 8-byte word.
	 * Bring s1 to an 8-byte boundary; s2 stays unaligned.
	 *
	 * r6 = (s1 & 7) * 8: bits preceding s1 in its aligned doubleword.
	 *
	 * NOTE(review): rlwinm is not in record form, so the beq below
	 * appears to test the CR0 left by the andi. at function entry,
	 * which is "ne" on this path; the branch then looks never taken.
	 * With r6 == 0 the fall-through code compares one full doubleword
	 * and advances by 8, so the result is the same.  Confirm before
	 * changing.
	 */
	rlwinm  r6,r3,3,26,28
	beq     .Ldiffoffset_align_s1_8bytes

	/* rA: aligned doubleword holding the start of s1, with the bytes
	 *     in front of s1 cleared (shift left, then back right).
	 * rB: the 8 bytes at s2, shifted right so that only its first
	 *     8 - offset bytes remain, lined up with rA.
	 */
	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3			/* back to a byte offset */
	bne	cr0,.LcmpAB_lightweight

	/* r6 = 8 - offset bytes were compared: s1 moves to its next
	 * doubleword, s2 advances by the same number of bytes
	 */
	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* s1 is now 8-byte aligned */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* the vector path is only tried for VMX_THRESH bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	/* up to 31 bytes use the doubleword loop, more use the unrolled
	 * 32-byte loop
	 */
	cmpdi   cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif
546
#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Check the first 32 bytes with scalar loads before paying for
	 * VMX; four doublewords, 8 bytes per iteration.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	/* cr1 is "eq" when enter_vmx_ops() returned 0: use the scalar path */
	ENTER_VMX_OPS
	beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First bring s1 (r3) to a 16-byte boundary.  r6 = s1 & 0xf. */
	andi.   r6,r3,0xf
	li	off16,16
	beq     .Ldiffoffset_vmx_s1_16bytes_align

	/* s1 is not 16-byte aligned: assemble the 16 bytes at s1 (v9) and
	 * at s2 (v10) from two aligned quadwords each and compare them.
	 */
	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx     v5,0,r3
	lvx     v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	/* equal: advance both pointers by 16 - r6 bytes, which puts s1 on
	 * a 16-byte boundary
	 */
	subfic  r6,r6,16
	subf    r5,r6,r5
	add     r3,r3,r6
	add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* s1 is 16-byte aligned.  Preload the first aligned quadword of
	 * s2 (v6) and its permute control (v4); CTR = n / 32, r5 = n & 31.
	 */
	lvx     v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* loop for 32 bytes each */
	clrldi  r5,r5,59
	mtctr	r6

.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* v6 holds the first quadword of s2 for this step; after each
	 * compare the quadword just loaded (v8) is copied into v6 for the
	 * next step.
	 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	/* all whole 32-byte groups matched: finish the 0..31 byte tail */
	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* the difference lies within the 16 bytes at r3/r4: let the
	 * scalar doubleword loop locate it
	 */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)
634