xref: /openbmc/linux/arch/ia64/lib/strlen.S (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds *
41da177e4SLinus Torvalds * Optimized version of the standard strlen() function
51da177e4SLinus Torvalds *
61da177e4SLinus Torvalds *
71da177e4SLinus Torvalds * Inputs:
81da177e4SLinus Torvalds *	in0	address of string
91da177e4SLinus Torvalds *
101da177e4SLinus Torvalds * Outputs:
111da177e4SLinus Torvalds *	ret0	the number of characters in the string (0 if empty string)
121da177e4SLinus Torvalds *	does not count the \0
131da177e4SLinus Torvalds *
141da177e4SLinus Torvalds * Copyright (C) 1999, 2001 Hewlett-Packard Co
151da177e4SLinus Torvalds *	Stephane Eranian <eranian@hpl.hp.com>
161da177e4SLinus Torvalds *
171da177e4SLinus Torvalds * 09/24/99 S.Eranian add speculation recovery code
181da177e4SLinus Torvalds */
191da177e4SLinus Torvalds
20*ab03e604SMasahiro Yamada#include <linux/export.h>
211da177e4SLinus Torvalds#include <asm/asmmacro.h>
221da177e4SLinus Torvalds
231da177e4SLinus Torvalds//
241da177e4SLinus Torvalds//
251da177e4SLinus Torvalds// This is an enhanced version of the basic strlen. it includes a combination
261da177e4SLinus Torvalds// of compute zero index (czx), parallel comparisons, speculative loads and
271da177e4SLinus Torvalds// loop unroll using rotating registers.
281da177e4SLinus Torvalds//
291da177e4SLinus Torvalds// General Ideas about the algorithm:
301da177e4SLinus Torvalds//	  The goal is to look at the string in chunks of 8 bytes.
311da177e4SLinus Torvalds//	  so we need to do a few extra checks at the beginning because the
321da177e4SLinus Torvalds//	  string may not be 8-byte aligned. In this case we load the 8byte
331da177e4SLinus Torvalds//	  quantity which includes the start of the string and mask the unused
341da177e4SLinus Torvalds//	  bytes with 0xff to avoid confusing czx.
351da177e4SLinus Torvalds//	  We use speculative loads and software pipelining to hide memory
361da177e4SLinus Torvalds//	  latency and do read ahead safely. This way we defer any exception.
371da177e4SLinus Torvalds//
381da177e4SLinus Torvalds//	  Because we don't want the kernel to be relying on particular
391da177e4SLinus Torvalds//	  settings of the DCR register, we provide recovery code in case
401da177e4SLinus Torvalds//	  speculation fails. The recovery code is going to "redo" the work using
411da177e4SLinus Torvalds//	  only normal loads. If we still get a fault then we generate a
421da177e4SLinus Torvalds//	  kernel panic. Otherwise we return the strlen as usual.
431da177e4SLinus Torvalds//
441da177e4SLinus Torvalds//	  The fact that speculation may fail can be caused, for instance, by
451da177e4SLinus Torvalds//	  the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
461da177e4SLinus Torvalds//	  a NaT bit will be set if the translation is not present. The normal
471da177e4SLinus Torvalds//	  load, on the other hand, will cause the translation to be inserted
481da177e4SLinus Torvalds//	  if the mapping exists.
491da177e4SLinus Torvalds//
501da177e4SLinus Torvalds//	  It should be noted that we execute recovery code only when we need
511da177e4SLinus Torvalds//	  to use the data that has been speculatively loaded: we don't execute
521da177e4SLinus Torvalds//	  recovery code on pure read ahead data.
531da177e4SLinus Torvalds//
541da177e4SLinus Torvalds// Remarks:
551da177e4SLinus Torvalds//	- the cmp r0,r0 is used as a fast way to initialize a predicate
561da177e4SLinus Torvalds//	  register to 1. This is required to make sure that we get the parallel
571da177e4SLinus Torvalds//	  compare correct.
581da177e4SLinus Torvalds//
591da177e4SLinus Torvalds//	- we don't use the epilogue counter to exit the loop but we need to set
601da177e4SLinus Torvalds//	  it to zero beforehand.
611da177e4SLinus Torvalds//
621da177e4SLinus Torvalds//	- after the loop we must test for Nat values because neither the
631da177e4SLinus Torvalds//	  czx nor cmp instruction raise a NaT consumption fault. We must be
641da177e4SLinus Torvalds//	  careful not to look too far for a Nat for which we don't care.
651da177e4SLinus Torvalds//	  For instance we don't need to look at a NaT in val2 if the zero byte
661da177e4SLinus Torvalds//	  was in val1.
671da177e4SLinus Torvalds//
681da177e4SLinus Torvalds//	- Clearly performance tuning is required.
691da177e4SLinus Torvalds//
701da177e4SLinus Torvalds//
711da177e4SLinus Torvalds//
721da177e4SLinus Torvalds#define saved_pfs	r11
731da177e4SLinus Torvalds#define	tmp		r10
741da177e4SLinus Torvalds#define base		r16
751da177e4SLinus Torvalds#define orig		r17
761da177e4SLinus Torvalds#define saved_pr	r18
771da177e4SLinus Torvalds#define src		r19
781da177e4SLinus Torvalds#define mask		r20
791da177e4SLinus Torvalds#define val		r21
801da177e4SLinus Torvalds#define val1		r22
811da177e4SLinus Torvalds#define val2		r23
821da177e4SLinus Torvalds
831da177e4SLinus TorvaldsGLOBAL_ENTRY(strlen)
841da177e4SLinus Torvalds	.prologue
851da177e4SLinus Torvalds	.save ar.pfs, saved_pfs
861da177e4SLinus Torvalds	alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
871da177e4SLinus Torvalds
881da177e4SLinus Torvalds	.rotr v[2], w[2]	// declares our 4 aliases
891da177e4SLinus Torvalds
901da177e4SLinus Torvalds	extr.u tmp=in0,0,3	// tmp=least significant 3 bits
911da177e4SLinus Torvalds	mov orig=in0		// keep trackof initial byte address
921da177e4SLinus Torvalds	dep src=0,in0,0,3	// src=8byte-aligned in0 address
931da177e4SLinus Torvalds	.save pr, saved_pr
941da177e4SLinus Torvalds	mov saved_pr=pr		// preserve predicates (rotation)
951da177e4SLinus Torvalds	;;
961da177e4SLinus Torvalds
971da177e4SLinus Torvalds	.body
981da177e4SLinus Torvalds
991da177e4SLinus Torvalds	ld8 v[1]=[src],8	// must not speculate: can fail here
1001da177e4SLinus Torvalds	shl tmp=tmp,3		// multiply by 8bits/byte
1011da177e4SLinus Torvalds	mov mask=-1		// our mask
1021da177e4SLinus Torvalds	;;
1031da177e4SLinus Torvalds	ld8.s w[1]=[src],8	// speculatively load next
1041da177e4SLinus Torvalds	cmp.eq p6,p0=r0,r0	// sets p6 to true for cmp.and
1051da177e4SLinus Torvalds	sub tmp=64,tmp		// how many bits to shift our mask on the right
1061da177e4SLinus Torvalds	;;
1071da177e4SLinus Torvalds	shr.u	mask=mask,tmp	// zero enough bits to hold v[1] valuable part
1081da177e4SLinus Torvalds	mov ar.ec=r0		// clear epilogue counter (saved in ar.pfs)
1091da177e4SLinus Torvalds	;;
1101da177e4SLinus Torvalds	add base=-16,src	// keep track of aligned base
1111da177e4SLinus Torvalds	or v[1]=v[1],mask	// now we have a safe initial byte pattern
1121da177e4SLinus Torvalds	;;
1131da177e4SLinus Torvalds1:
1141da177e4SLinus Torvalds	ld8.s v[0]=[src],8	// speculatively load next
1151da177e4SLinus Torvalds	czx1.r val1=v[1]	// search 0 byte from right
1161da177e4SLinus Torvalds	czx1.r val2=w[1]	// search 0 byte from right following 8bytes
1171da177e4SLinus Torvalds	;;
1181da177e4SLinus Torvalds	ld8.s w[0]=[src],8	// speculatively load next to next
1191da177e4SLinus Torvalds	cmp.eq.and p6,p0=8,val1	// p6 = p6 and val1==8
1201da177e4SLinus Torvalds	cmp.eq.and p6,p0=8,val2	// p6 = p6 and mask==8
1211da177e4SLinus Torvalds(p6)	br.wtop.dptk 1b		// loop until p6 == 0
1221da177e4SLinus Torvalds	;;
1231da177e4SLinus Torvalds	//
1241da177e4SLinus Torvalds	// We must return try the recovery code iff
1251da177e4SLinus Torvalds	// val1_is_nat || (val1==8 && val2_is_nat)
1261da177e4SLinus Torvalds	//
1271da177e4SLinus Torvalds	// XXX Fixme
1281da177e4SLinus Torvalds	//	- there must be a better way of doing the test
1291da177e4SLinus Torvalds	//
1301da177e4SLinus Torvalds	cmp.eq  p8,p9=8,val1	// p6 = val1 had zero (disambiguate)
1311da177e4SLinus Torvalds	tnat.nz p6,p7=val1	// test NaT on val1
1321da177e4SLinus Torvalds(p6)	br.cond.spnt .recover	// jump to recovery if val1 is NaT
1331da177e4SLinus Torvalds	;;
1341da177e4SLinus Torvalds	//
1351da177e4SLinus Torvalds	// if we come here p7 is true, i.e., initialized for // cmp
1361da177e4SLinus Torvalds	//
1371da177e4SLinus Torvalds	cmp.eq.and  p7,p0=8,val1// val1==8?
1381da177e4SLinus Torvalds	tnat.nz.and p7,p0=val2	// test NaT if val2
1391da177e4SLinus Torvalds(p7)	br.cond.spnt .recover	// jump to recovery if val2 is NaT
1401da177e4SLinus Torvalds	;;
1411da177e4SLinus Torvalds(p8)	mov val1=val2		// the other test got us out of the loop
1421da177e4SLinus Torvalds(p8)	adds src=-16,src	// correct position when 3 ahead
1431da177e4SLinus Torvalds(p9)	adds src=-24,src	// correct position when 4 ahead
1441da177e4SLinus Torvalds	;;
1451da177e4SLinus Torvalds	sub ret0=src,orig	// distance from base
1461da177e4SLinus Torvalds	sub tmp=8,val1		// which byte in word
1471da177e4SLinus Torvalds	mov pr=saved_pr,0xffffffffffff0000
1481da177e4SLinus Torvalds	;;
1491da177e4SLinus Torvalds	sub ret0=ret0,tmp	// adjust
1501da177e4SLinus Torvalds	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
1511da177e4SLinus Torvalds	br.ret.sptk.many rp	// end of normal execution
1521da177e4SLinus Torvalds
1531da177e4SLinus Torvalds	//
1541da177e4SLinus Torvalds	// Outlined recovery code when speculation failed
1551da177e4SLinus Torvalds	//
1561da177e4SLinus Torvalds	// This time we don't use speculation and rely on the normal exception
1571da177e4SLinus Torvalds	// mechanism. that's why the loop is not as good as the previous one
1581da177e4SLinus Torvalds	// because read ahead is not possible
1591da177e4SLinus Torvalds	//
1601da177e4SLinus Torvalds	// IMPORTANT:
1611da177e4SLinus Torvalds	// Please note that in the case of strlen() as opposed to strlen_user()
1621da177e4SLinus Torvalds	// we don't use the exception mechanism, as this function is not
1631da177e4SLinus Torvalds	// supposed to fail. If that happens it means we have a bug and the
1641da177e4SLinus Torvalds	// code will cause of kernel fault.
1651da177e4SLinus Torvalds	//
1661da177e4SLinus Torvalds	// XXX Fixme
1671da177e4SLinus Torvalds	//	- today we restart from the beginning of the string instead
1681da177e4SLinus Torvalds	//	  of trying to continue where we left off.
1691da177e4SLinus Torvalds	//
1701da177e4SLinus Torvalds.recover:
1711da177e4SLinus Torvalds	ld8 val=[base],8	// will fail if unrecoverable fault
1721da177e4SLinus Torvalds	;;
1731da177e4SLinus Torvalds	or val=val,mask		// remask first bytes
1741da177e4SLinus Torvalds	cmp.eq p0,p6=r0,r0	// nullify first ld8 in loop
1751da177e4SLinus Torvalds	;;
1761da177e4SLinus Torvalds	//
1771da177e4SLinus Torvalds	// ar.ec is still zero here
1781da177e4SLinus Torvalds	//
1791da177e4SLinus Torvalds2:
1801da177e4SLinus Torvalds(p6)	ld8 val=[base],8	// will fail if unrecoverable fault
1811da177e4SLinus Torvalds	;;
1821da177e4SLinus Torvalds	czx1.r val1=val		// search 0 byte from right
1831da177e4SLinus Torvalds	;;
1841da177e4SLinus Torvalds	cmp.eq p6,p0=8,val1	// val1==8 ?
1851da177e4SLinus Torvalds(p6)	br.wtop.dptk 2b		// loop until p6 == 0
1861da177e4SLinus Torvalds	;;			// (avoid WAW on p63)
1871da177e4SLinus Torvalds	sub ret0=base,orig	// distance from base
1881da177e4SLinus Torvalds	sub tmp=8,val1
1891da177e4SLinus Torvalds	mov pr=saved_pr,0xffffffffffff0000
1901da177e4SLinus Torvalds	;;
1911da177e4SLinus Torvalds	sub ret0=ret0,tmp	// length=now - back -1
1921da177e4SLinus Torvalds	mov ar.pfs=saved_pfs	// because of ar.ec, restore no matter what
1931da177e4SLinus Torvalds	br.ret.sptk.many rp	// end of successful recovery code
1941da177e4SLinus TorvaldsEND(strlen)
195e007c533SAl ViroEXPORT_SYMBOL(strlen)
196