xref: /openbmc/linux/arch/alpha/lib/divide.S (revision f3c78e94)
1b2441318SGreg Kroah-Hartman/* SPDX-License-Identifier: GPL-2.0 */
21da177e4SLinus Torvalds/*
31da177e4SLinus Torvalds * arch/alpha/lib/divide.S
41da177e4SLinus Torvalds *
51da177e4SLinus Torvalds * (C) 1995 Linus Torvalds
61da177e4SLinus Torvalds *
71da177e4SLinus Torvalds * Alpha division..
81da177e4SLinus Torvalds */
91da177e4SLinus Torvalds
101da177e4SLinus Torvalds/*
111da177e4SLinus Torvalds * The alpha chip doesn't provide hardware division, so we have to do it
121da177e4SLinus Torvalds * by hand.  The compiler expects the functions
131da177e4SLinus Torvalds *
141da177e4SLinus Torvalds *	__divqu: 64-bit unsigned long divide
151da177e4SLinus Torvalds *	__remqu: 64-bit unsigned long remainder
161da177e4SLinus Torvalds *	__divq/__remq: signed 64-bit
171da177e4SLinus Torvalds *	__divlu/__remlu: unsigned 32-bit
181da177e4SLinus Torvalds *	__divl/__reml: signed 32-bit
191da177e4SLinus Torvalds *
201da177e4SLinus Torvalds * These are not normal C functions: instead of the normal
211da177e4SLinus Torvalds * calling sequence, these expect their arguments in registers
221da177e4SLinus Torvalds * $24 and $25, and return the result in $27. Register $28 may
231da177e4SLinus Torvalds * be clobbered (assembly temporary), anything else must be saved.
241da177e4SLinus Torvalds *
251da177e4SLinus Torvalds * In short: painful.
261da177e4SLinus Torvalds *
271da177e4SLinus Torvalds * This is a rather simple bit-at-a-time algorithm: it's very good
281da177e4SLinus Torvalds * at dividing random 64-bit numbers, but the more usual case where
291da177e4SLinus Torvalds * the divisor is small is handled better by the DEC algorithm
301da177e4SLinus Torvalds * using lookup tables. This uses much less memory, though, and is
311da177e4SLinus Torvalds * nicer on the cache.. Besides, I don't know the copyright status
321da177e4SLinus Torvalds * of the DEC code.
331da177e4SLinus Torvalds */
341da177e4SLinus Torvalds
351da177e4SLinus Torvalds/*
361da177e4SLinus Torvalds * My temporaries:
371da177e4SLinus Torvalds *	$0 - current bit
381da177e4SLinus Torvalds *	$1 - shifted divisor
391da177e4SLinus Torvalds *	$2 - modulus/quotient
401da177e4SLinus Torvalds *
411da177e4SLinus Torvalds *	$23 - return address
421da177e4SLinus Torvalds *	$24 - dividend
431da177e4SLinus Torvalds *	$25 - divisor
441da177e4SLinus Torvalds *
451da177e4SLinus Torvalds *	$27 - quotient/modulus
461da177e4SLinus Torvalds *	$28 - compare status
471da177e4SLinus Torvalds */
481da177e4SLinus Torvalds
49*f3c78e94SMasahiro Yamada#include <linux/export.h>
/*
 * Build-time configuration.  Two preprocessor symbols select which
 * pair of routines this file assembles into:
 *
 *	DIV	defined: divide (__div*), otherwise remainder (__rem*)
 *	INTSIZE	defined: 32-bit operands (suffixes lu/l), otherwise
 *		64-bit operands (suffixes qu/q)
 *
 * The result register is always $27; DIV decides whether the quotient
 * or the modulus lives there, the other one goes to the scratch $2.
 */
501da177e4SLinus Torvalds#define halt .long 0	/* a zero longword; no use of halt is visible in this file */
511da177e4SLinus Torvalds
521da177e4SLinus Torvalds/*
531da177e4SLinus Torvalds * Select function type and registers
541da177e4SLinus Torvalds */
551da177e4SLinus Torvalds#define mask	$0	/* single-bit mask tracking the current quotient bit */
561da177e4SLinus Torvalds#define divisor	$1	/* working (shifted) copy of the divisor */
571da177e4SLinus Torvalds#define compare $28	/* result of the unsigned compares */
581da177e4SLinus Torvalds#define tmp1	$3	/* candidate remainder / negated result */
591da177e4SLinus Torvalds#define tmp2	$4	/* candidate quotient, only used in the DIV build */
601da177e4SLinus Torvalds
611da177e4SLinus Torvalds#ifdef DIV
621da177e4SLinus Torvalds#define DIV_ONLY(x,y...) x,##y	/* emit the instruction in the DIV build */
631da177e4SLinus Torvalds#define MOD_ONLY(x,y...)	/* drop the instruction in the DIV build */
641da177e4SLinus Torvalds#define func(x) __div##x	/* symbol name prefix: __div<suffix> */
651da177e4SLinus Torvalds#define modulus $2	/* running remainder is scratch here */
661da177e4SLinus Torvalds#define quotient $27	/* quotient is the returned value */
671da177e4SLinus Torvalds#define GETSIGN(x) xor $24,$25,x	/* sign bit of x set when the operand signs differ */
681da177e4SLinus Torvalds#define STACK 48	/* five 8-byte slots are used (0..32); presumably padded to 48 for alignment */
691da177e4SLinus Torvalds#else
701da177e4SLinus Torvalds#define DIV_ONLY(x,y...)	/* drop the instruction in the remainder build */
711da177e4SLinus Torvalds#define MOD_ONLY(x,y...) x,##y	/* emit the instruction in the remainder build */
721da177e4SLinus Torvalds#define func(x) __rem##x	/* symbol name prefix: __rem<suffix> */
731da177e4SLinus Torvalds#define modulus $27	/* running remainder is the returned value */
741da177e4SLinus Torvalds#define quotient $2	/* quotient is scratch here */
751da177e4SLinus Torvalds#define GETSIGN(x) bis $24,$24,x	/* x = dividend, so the result takes the dividend's sign */
761da177e4SLinus Torvalds#define STACK 32	/* four 8-byte slots are used (0..24) */
771da177e4SLinus Torvalds#endif
781da177e4SLinus Torvalds
791da177e4SLinus Torvalds/*
801da177e4SLinus Torvalds * For 32-bit operations, we need to extend to 64-bit
811da177e4SLinus Torvalds */
821da177e4SLinus Torvalds#ifdef INTSIZE
831da177e4SLinus Torvalds#define ufunction func(lu)	/* __divlu or __remlu */
841da177e4SLinus Torvalds#define sfunction func(l)	/* __divl or __reml */
851da177e4SLinus Torvalds#define LONGIFY(x) zapnot x,15,x	/* keep the low four bytes, clear the rest: zero-extend 32 -> 64 */
861da177e4SLinus Torvalds#define SLONGIFY(x) addl x,0,x	/* 32-bit add of 0: sign-extend 32 -> 64 */
871da177e4SLinus Torvalds#else
881da177e4SLinus Torvalds#define ufunction func(qu)	/* __divqu or __remqu */
891da177e4SLinus Torvalds#define sfunction func(q)	/* __divq or __remq */
901da177e4SLinus Torvalds#define LONGIFY(x)	/* 64-bit operands need no extension */
911da177e4SLinus Torvalds#define SLONGIFY(x)	/* 64-bit operands need no extension */
921da177e4SLinus Torvalds#endif
931da177e4SLinus Torvalds
/*
 * ufunction: unsigned divide (DIV build) or remainder (otherwise) of
 * $24 by $25, result in $27; returns to the address held in $23.
 *
 * Shift-and-subtract.  The first loop shifts the divisor and a
 * one-bit mask left in step until the pre-shift divisor is no longer
 * below the dividend (or, in the 64-bit build, until the divisor's top
 * bit is set).  The second loop walks back right one bit at a time:
 * wherever the divisor fits into the running remainder it is
 * subtracted and, in the DIV build, the mask bit is added to the
 * quotient.
 *
 * $0-$3 (and $4 in the DIV build) are saved in the frame and restored
 * on exit; $28 is overwritten.  With a zero divisor both loops are
 * skipped, so the result is 0 (DIV build) or the LONGIFY'd dividend
 * (remainder build).
 *
 * Label 7 is a second entry point: sfunction branches there after
 * allocating a frame of the same STACK size itself.
 */
941da177e4SLinus Torvalds.set noat	/* $28 (the assembler temporary) is used explicitly below */
951da177e4SLinus Torvalds.align	3
961da177e4SLinus Torvalds.globl	ufunction
971da177e4SLinus Torvalds.ent	ufunction
981da177e4SLinus Torvaldsufunction:
991da177e4SLinus Torvalds	subq	$30,STACK,$30	/* allocate the STACK-byte frame */
1001da177e4SLinus Torvalds	.frame	$30,STACK,$23
1011da177e4SLinus Torvalds	.prologue 0
1021da177e4SLinus Torvalds
1031da177e4SLinus Torvalds7:	stq	$1, 0($30)	/* save $1; sfunction enters here for non-negative operands */
1041da177e4SLinus Torvalds	bis	$25,$25,divisor	/* divisor = $25 */
1051da177e4SLinus Torvalds	stq	$2, 8($30)	/* save $2 */
1061da177e4SLinus Torvalds	bis	$24,$24,modulus	/* running remainder starts as the dividend */
1071da177e4SLinus Torvalds	stq	$0,16($30)	/* save $0 */
1081da177e4SLinus Torvalds	bis	$31,$31,quotient	/* quotient = 0 */
1091da177e4SLinus Torvalds	LONGIFY(divisor)	/* 32-bit build: zero-extend the divisor */
1101da177e4SLinus Torvalds	stq	tmp1,24($30)	/* save $3 */
1111da177e4SLinus Torvalds	LONGIFY(modulus)	/* 32-bit build: zero-extend the dividend */
1121da177e4SLinus Torvalds	bis	$31,1,mask	/* mask = 1 */
1131da177e4SLinus Torvalds	DIV_ONLY(stq tmp2,32($30))	/* save $4, which only the DIV build uses */
1141da177e4SLinus Torvalds	beq	divisor, 9f			/* div by zero */
1151da177e4SLinus Torvalds
1161da177e4SLinus Torvalds#ifdef INTSIZE
1171da177e4SLinus Torvalds	/*
1181da177e4SLinus Torvalds	 * shift divisor left, using 3-bit shifts for
1191da177e4SLinus Torvalds	 * 32-bit divides as we can't overflow. Three-bit
1201da177e4SLinus Torvalds	 * shifts will result in looping three times less
1211da177e4SLinus Torvalds	 * here, but can result in two loops more later.
1221da177e4SLinus Torvalds	 * Thus using a large shift isn't worth it (and
1231da177e4SLinus Torvalds	 * s8add pairs better than a sll..)
1241da177e4SLinus Torvalds	 */
1251da177e4SLinus Torvalds1:	cmpult	divisor,modulus,compare	/* compare = (divisor < modulus), taken before the shift */
1261da177e4SLinus Torvalds	s8addq	divisor,$31,divisor	/* divisor = divisor * 8 + 0 */
1271da177e4SLinus Torvalds	s8addq	mask,$31,mask	/* mask shifted left by the same three bits */
1281da177e4SLinus Torvalds	bne	compare,1b	/* repeat while the pre-shift divisor was below modulus */
1291da177e4SLinus Torvalds#else
1301da177e4SLinus Torvalds1:	cmpult	divisor,modulus,compare	/* compare = (divisor < modulus), taken before the shift */
1311da177e4SLinus Torvalds	blt     divisor, 2f	/* top bit already set: doubling again would overflow */
1321da177e4SLinus Torvalds	addq	divisor,divisor,divisor	/* divisor <<= 1 */
1331da177e4SLinus Torvalds	addq	mask,mask,mask	/* mask <<= 1 */
1341da177e4SLinus Torvalds	bne	compare,1b	/* repeat while the pre-shift divisor was below modulus */
1351da177e4SLinus Torvalds	unop	/* no-op; presumably padding so label 2 is aligned */
1361da177e4SLinus Torvalds#endif
1371da177e4SLinus Torvalds
1381da177e4SLinus Torvalds	/* ok, start to go right again.. */
1391da177e4SLinus Torvalds2:	DIV_ONLY(addq quotient,mask,tmp2)	/* tmp2 = quotient with the current bit added */
1401da177e4SLinus Torvalds	srl	mask,1,mask	/* advance to the next lower bit; tmp2 already holds the old one */
1411da177e4SLinus Torvalds	cmpule	divisor,modulus,compare	/* compare = (divisor <= modulus): does the divisor fit? */
1421da177e4SLinus Torvalds	subq	modulus,divisor,tmp1	/* tmp1 = remainder if it does fit */
1431da177e4SLinus Torvalds	DIV_ONLY(cmovne compare,tmp2,quotient)	/* commit the quotient bit when it fits */
1441da177e4SLinus Torvalds	srl	divisor,1,divisor	/* divisor >>= 1 for the next round */
1451da177e4SLinus Torvalds	cmovne	compare,tmp1,modulus	/* commit the subtraction when it fits */
1461da177e4SLinus Torvalds	bne	mask,2b	/* done once the mask bit has been shifted out */
1471da177e4SLinus Torvalds
1481da177e4SLinus Torvalds9:	ldq	$1, 0($30)	/* restore the saved registers */
1491da177e4SLinus Torvalds	ldq	$2, 8($30)
1501da177e4SLinus Torvalds	ldq	$0,16($30)
1511da177e4SLinus Torvalds	ldq	tmp1,24($30)
1521da177e4SLinus Torvalds	DIV_ONLY(ldq tmp2,32($30))
1531da177e4SLinus Torvalds	addq	$30,STACK,$30	/* release the frame */
1541da177e4SLinus Torvalds	ret	$31,($23),1	/* return to the address in $23 */
1551da177e4SLinus Torvalds	.end	ufunction
15600fc0e0dSAl ViroEXPORT_SYMBOL(ufunction)
1571da177e4SLinus Torvalds
1581da177e4SLinus Torvalds/*
1591da177e4SLinus Torvalds * Uhh.. Ugly signed division. I'd rather not have it at all, but
1601da177e4SLinus Torvalds * it's needed in some circumstances. There are different ways to
1611da177e4SLinus Torvalds * handle this, really. This does:
1621da177e4SLinus Torvalds * 	-a / b = a / -b = -(a / b)
1631da177e4SLinus Torvalds *	-a % b = -(a % b)
1641da177e4SLinus Torvalds *	a % -b = a % b
1651da177e4SLinus Torvalds * which is probably not the best solution, but at least should
1661da177e4SLinus Torvalds * have the property that (x/y)*y + (x%y) = x.
1671da177e4SLinus Torvalds */
/*
 * sfunction: signed divide (DIV build) or remainder (otherwise) of
 * $24 by $25, result in $27; returns to the address held in $23.
 *
 * If neither operand is negative this branches straight into
 * ufunction at label 7, which then returns to the caller itself.
 * Otherwise the original operands are saved, ufunction is called on
 * their absolute values, and the result is negated when GETSIGN
 * reports a negative sign: operands of differing sign in the DIV
 * build, a negative dividend in the remainder build.
 *
 * $24 and $25 are reloaded with their original values, $23 and $3 are
 * restored, and $28 is overwritten.
 *
 * Frame slots: 0 = $24, 8 = $25, 16 = $23, 24 = $3.
 */
1681da177e4SLinus Torvalds.align 3
1691da177e4SLinus Torvalds.globl	sfunction
1701da177e4SLinus Torvalds.ent	sfunction
1711da177e4SLinus Torvaldssfunction:
1721da177e4SLinus Torvalds	subq	$30,STACK,$30	/* allocate the frame; same size ufunction's label 7 path expects */
1731da177e4SLinus Torvalds	.frame	$30,STACK,$23
1741da177e4SLinus Torvalds	.prologue 0
1751da177e4SLinus Torvalds	bis	$24,$25,$28	/* OR of the operands: sign bit set if either is negative */
1761da177e4SLinus Torvalds	SLONGIFY($28)	/* 32-bit build: judge the sign from bit 31 */
1771da177e4SLinus Torvalds	bge	$28,7b	/* both non-negative: continue in the unsigned routine */
1781da177e4SLinus Torvalds	stq	$24,0($30)	/* keep the original dividend for the sign fix-up */
1791da177e4SLinus Torvalds	subq	$31,$24,$28	/* $28 = -$24 */
1801da177e4SLinus Torvalds	stq	$25,8($30)	/* keep the original divisor for the sign fix-up */
1811da177e4SLinus Torvalds	cmovlt	$24,$28,$24	/* abs($24) */
1821da177e4SLinus Torvalds	stq	$23,16($30)	/* the bsr below overwrites $23 */
1831da177e4SLinus Torvalds	subq	$31,$25,$28	/* $28 = -$25 */
1841da177e4SLinus Torvalds	stq	tmp1,24($30)	/* save $3, used below for the negated result */
1851da177e4SLinus Torvalds	cmovlt	$25,$28,$25	/* abs($25) */
1861da177e4SLinus Torvalds	unop	/* no-op; presumably scheduling/alignment padding */
1871da177e4SLinus Torvalds	bsr	$23,ufunction	/* unsigned result of abs($24), abs($25) arrives in $27 */
1881da177e4SLinus Torvalds	ldq	$24,0($30)	/* reload the original dividend */
1891da177e4SLinus Torvalds	ldq	$25,8($30)	/* reload the original divisor */
1901da177e4SLinus Torvalds	GETSIGN($28)	/* $28 carries the sign the result must have */
1911da177e4SLinus Torvalds	subq	$31,$27,tmp1	/* tmp1 = -result */
1921da177e4SLinus Torvalds	SLONGIFY($28)	/* 32-bit build: judge the sign from bit 31 */
1931da177e4SLinus Torvalds	ldq	$23,16($30)	/* restore the caller's return address */
1941da177e4SLinus Torvalds	cmovlt	$28,tmp1,$27	/* negate the result when the sign is negative */
1951da177e4SLinus Torvalds	ldq	tmp1,24($30)	/* restore $3 */
1961da177e4SLinus Torvalds	addq	$30,STACK,$30	/* release the frame */
1971da177e4SLinus Torvalds	ret	$31,($23),1	/* return to the address in $23 */
1981da177e4SLinus Torvalds	.end	sfunction
19900fc0e0dSAl ViroEXPORT_SYMBOL(sfunction)
200