/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
*/

#include <asm/export.h>
#define halt .long 0

/*
 * Select function type and registers
 *
 * Working registers used by every build of this file.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * With DIV defined the __div* entry points are built: the quotient is
 * accumulated in the result register $27, the running remainder lives
 * in $2, and one more temporary (tmp2) is spilled at 32($30), which is
 * why the frame is larger.  Without DIV the __rem* entry points are
 * built and the two register roles are swapped.
 *
 * GETSIGN(x) puts into x a value carrying the sign that the signed
 * result has to take: the XOR of both operands for a quotient, a copy
 * of the dividend for a remainder.
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY is applied to the unsigned working copies of the operands,
 * SLONGIFY to values whose sign is about to be tested.  Both expand
 * to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction - unsigned divide / remainder
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV build) or remainder (otherwise)
 *
 * $0, $1, $2, tmp1 and, in the DIV build, tmp2 are spilled to the
 * frame and reloaded before returning.  $28 is used as scratch and is
 * not preserved.
 *
 * A zero divisor branches straight to the epilogue: at that point the
 * quotient register still holds 0 and the modulus register still
 * holds the (LONGIFY'd) dividend, so that is what comes back in $27.
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Local label 7 is also reached from sfunction (bge ...,7b),
	 * which gets here after allocating a frame of STACK bytes of
	 * its own, so everything from this point on must only rely on
	 * $23, $24, $25 and a frame of that size.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */

	/*
	 * DIV_ONLY above is either an stq or nothing at all; the
	 * .align directive keeps label 1 nicely aligned in both cases.
	 */
	.align	4

	/*
	 * Phase 1: walk the divisor and the current-bit mask to the
	 * left together.  The comparison is taken before the shift, so
	 * the loop repeats while the not-yet-shifted divisor was still
	 * below the dividend.
	 */
1:	cmpult	divisor,modulus,compare	# E :
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	nop				# E :
	nop				# E :
	/* Top bit of the divisor already set: stop before doubling it. */
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/*
	 * Phase 2: ok, start to go right again..
	 *
	 * For every mask position, if the shifted divisor fits into the
	 * running remainder, subtract it (and, in the DIV build, add
	 * the mask bit into the quotient).  Both updates are computed
	 * unconditionally into temporaries and committed with cmovne.
	 * The loop ends once the mask has been shifted out to zero.
	 *
	 * The remainder-only build carries a nop in place of the
	 * quotient add so that things stay nicely bundled.
	 */
2:
#ifdef DIV
	addq	quotient,mask,tmp2	# E :
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

	cmovne	compare,tmp2,quotient	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	nop				# E :
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/*
	 * Epilogue (also the divide-by-zero exit): reload the spilled
	 * registers, release the frame and return through $23.
	 */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	ldq	tmp2,32($30)		# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really.
This does:
 *	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 */

/*
 * sfunction - signed divide / remainder
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = result; $24 and $25 hold their entry values again on
 *	the slow path because they are reloaded from the frame.
 *
 * Fast path: when the OR of both operands (sign-extended from 32 bits
 * by SLONGIFY in the INTSIZE build) is not negative, control continues
 * at local label 7 inside ufunction, which uses and later releases the
 * frame allocated here.
 *
 * Slow path: both operands are replaced by their negation if they are
 * negative, ufunction is called with $23 as the link register, and the
 * result is negated when GETSIGN yields a negative value.
 *
 * Frame slots used on the slow path:
 *	 0($30)	$24
 *	 8($30)	$25
 *	16($30)	$23 (overwritten by the bsr below)
 *	24($30)	tmp1 (used here to hold the negated result)
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	/* $28 < 0 exactly when at least one operand has its sign bit set */
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	/* nothing negative: let the unsigned code finish the job */
	bge	$28,7b			# U :

	/* keep the original operands; they are needed again for GETSIGN */
	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	/* unsigned operation on the magnitudes; result comes back in $27 */
	bsr	$23,ufunction		# L0: L U L U

	/* restore the caller's operands and work out the result sign */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	/* take the negated result if the sign value is negative */
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
EXPORT_SYMBOL(sfunction)