/* xref: /openbmc/linux/arch/alpha/lib/ev6-divide.S (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220) */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3 - tmp1: modulus minus shifted divisor
 *	$4 - tmp2: quotient plus current bit (divide only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <linux/export.h>

/* Emits a single all-zero longword; not referenced in this file. */
#define halt .long 0

/*
 * Select function type and registers
 *
 * This file is a template.  Two preprocessor symbols pick which of the
 * eight routines it expands to:
 *
 *	DIV	 defined -> quotient routines  (__div*)
 *		 otherwise  remainder routines (__rem*)
 *	INTSIZE	 defined -> 32-bit operands    (*lu / *l)
 *		 otherwise  64-bit operands    (*qu / *q)
 */

/* Register aliases shared by every variant. */
#define mask	$0	/* single set bit: quotient bit currently under test */
#define divisor	$1	/* working copy of the divisor, shifted left then right */
#define compare $28	/* result of the cmpult/cmpule tests */
#define tmp1	$3	/* modulus - divisor, committed with cmovne */
#define tmp2	$4	/* quotient + mask, committed with cmovne (DIV only) */

/*
 * DIV_ONLY()/MOD_ONLY() emit their argument only in the matching build
 * and expand to nothing in the other one.  The argument is an
 * instruction whose operand list contains commas, so the macros are
 * variadic and "x,##y" glues the pieces back together.
 *
 * Whichever value is the routine's result (quotient or modulus) is
 * kept in $27, the return register; the other one lives in $2.
 *
 * GETSIGN(x) leaves the sign of the signed result in bit 63 of x:
 * the XOR of both operands for a quotient, the dividend itself for a
 * remainder.
 *
 * STACK is the frame size in bytes.  The DIV build saves one more
 * register (tmp2, at offset 32) than the remainder build.
 * NOTE(review): 48 rather than 40 presumably keeps $30 16-byte
 * aligned - confirm.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY(x)  keeps the low four bytes of x and clears the rest
 *	       (zapnot with byte mask 15), i.e. zero-extends.
 * SLONGIFY(x) adds zero as a longword, i.e. sign-extends the low
 *	       32 bits of x.
 * Both expand to nothing for the 64-bit routines.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide or remainder
 *	(__divqu / __remqu, or __divlu / __remlu when INTSIZE is defined)
 *
 * In:	$24 = dividend
 *	$25 = divisor
 *	$23 = return address
 * Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *
 * $28 is clobbered.  $0-$3 (plus $4 in the DIV build) are used as
 * temporaries but saved in the frame and restored on exit; $24 and
 * $25 are only read.
 *
 * Frame layout (STACK bytes at $30):
 *	 0: $1   8: $2   16: $0   24: tmp1   32: tmp2 (DIV build only)
 *
 * A zero divisor is not trapped: the code branches straight to the
 * epilogue, so the DIV build returns 0 and the remainder build returns
 * the (LONGIFY'd) dividend.
 *
 * Local label 7 doubles as a second entry point: sfunction branches to
 * it, with its own frame already allocated, when no sign fix-up is
 * needed.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Save the temporaries while loading the working registers:
	 * divisor = $25, modulus = $24, quotient = 0, mask = 1.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
	/*
	 * Phase 1: shift divisor and mask left together for as long as
	 * the divisor, as it was before the shift, is still below the
	 * modulus.
	 */
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit operands: shift one bit at a time and stop as soon as
	 * the top bit of the divisor is set, before another shift
	 * could lose it.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * Phase 2: one quotient bit per iteration.  With
	 * compare = (divisor <= modulus):
	 *	tmp2 = quotient + mask		(DIV build only)
	 *	tmp1 = modulus - divisor
	 *	if (compare) { quotient = tmp2; modulus = tmp1; }
	 *	mask >>= 1; divisor >>= 1;
	 * tmp2 is formed before mask is shifted, and the loop ends once
	 * the mask bit has been shifted out.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/*
	 * Epilogue (also the divide-by-zero exit): restore the saved
	 * temporaries, release the frame and return through $23.  The
	 * result is already in $27.
	 */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * sfunction: signed divide or remainder
 *	(__divq / __remq, or __divl / __reml when INTSIZE is defined)
 *
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:	$24 = dividend
 *	$25 = divisor
 *	$23 = return address
 * Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *
 * $28 is clobbered.  On the slow path $24 and $25 are replaced by their
 * absolute values for the call and reloaded with the caller's values
 * afterwards; $23 and tmp1 are saved and restored as well.
 *
 * Frame layout on the slow path (STACK bytes at $30):
 *	 0: $24   8: $25   16: $23   24: tmp1
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	/*
	 * Fast path: if the OR of both operands (sign-extended from 32
	 * bits for INTSIZE) is non-negative, neither is negative.
	 * Continue in the unsigned body at label 7, which reuses this
	 * frame and returns straight to the caller.
	 */
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	/*
	 * Slow path: save the original operands, the return address
	 * and tmp1, then replace each operand by its absolute value.
	 */
	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
	nop				# E :
	/* nested call: ufunction allocates and frees a frame of its own */
	bsr	$23,ufunction		# L0: L U L U

	/*
	 * Reload the original operands, take the result sign from them
	 * with GETSIGN and negate $27 when that sign is negative.
	 */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)
