/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divqs/__remqs: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divls/__remls: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <linux/export.h>
#define halt .long 0

/*
 * Select function type and registers
 *
 * The same source is built several times: with DIV defined it produces
 * the __div* entry points, otherwise the __rem* ones.  The only real
 * difference is which of quotient/modulus lives in the result register
 * $27, how the sign of the signed result is derived (GETSIGN), and the
 * frame size (the DIV build saves one extra temporary, tmp2).
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY keeps the low four bytes and clears the rest (zero extension);
 * SLONGIFY re-derives the 64-bit value from the low 32 bits with addl
 * (sign extension).  Both expand to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction: unsigned divide (DIV defined) or unsigned remainder.
 *
 *	In:	$24 = dividend, $25 = divisor, $23 = return address
 *	Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *
 * $0, $1, $2 and tmp1 (plus tmp2 in the DIV build) are saved in the
 * STACK-byte frame and restored before returning.  $28 is used as the
 * compare scratch register and is not preserved.
 *
 * A zero divisor is not trapped: the code branches straight to the
 * register restore at label 9, so $27 holds whatever was set up before
 * the test (0 in the DIV build, the dividend in the remainder build).
 *
 * Label 7 doubles as the entry point used by sfunction, which has
 * already allocated its own STACK-byte frame when it branches here.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit build: shift one bit at a time and stop as soon as the
	 * top bit of the divisor is set (blt), since a further doubling
	 * would lose it.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 *
	 * Each pass handles one bit: if the shifted divisor still fits
	 * in the running modulus (compare != 0), subtract it and, in the
	 * DIV build, add the current mask bit into the quotient.  Both
	 * mask and divisor are then shifted right; the loop ends when
	 * the mask has been shifted out to zero.
	 */
#ifdef DIV
	DIV_ONLY(addq	quotient,mask,tmp2)	# E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne	compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Common exit: restore the saved temporaries and pop the frame. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq	tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * sfunction: signed divide (DIV defined) or signed remainder.
 *
 *	In:	$24 = dividend, $25 = divisor, $23 = return address
 *	Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *
 * Fast path: if neither operand is negative ($24 | $25 has its sign
 * bit clear after SLONGIFY) the work is handed to label 7 inside
 * ufunction, reusing the frame allocated here.
 *
 * Slow path: the original $24, $25, $23 and tmp1 are saved, both
 * operands are replaced by their absolute values, and ufunction is
 * called with bsr.  Afterwards the original operands are reloaded,
 * GETSIGN derives the sign of the result from them (xor of both for
 * a quotient, the dividend alone for a remainder) and $27 is negated
 * when that sign is negative.  $28 is clobbered on both paths.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)