#ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64

#include <linux/types.h>
#include <asm/compiler.h>

/*
 * The semantics of do_div() are:
 *
 * uint32_t do_div(uint64_t *n, uint32_t base)
 * {
 *	uint32_t remainder = *n % base;
 *	*n = *n / base;
 *	return remainder;
 * }
 *
 * In other words, a 64-bit dividend with a 32-bit divisor producing
 * a 64-bit result and a 32-bit remainder.  To accomplish this optimally
 * we call a special __do_div64 helper with a completely non-standard
 * calling convention for arguments and results (beware).
 */

#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif

#define __do_div_asm(n, base) \
({ \
        register unsigned int __base      asm("r4") = base; \
        register unsigned long long __n   asm("r0") = n; \
        register unsigned long long __res asm("r2"); \
        register unsigned int __rem       asm(__xh); \
        asm(    __asmeq("%0", __xh) \
                __asmeq("%1", "r2") \
                __asmeq("%2", "r0") \
                __asmeq("%3", "r4") \
                "bl	__do_div64" \
                : "=r" (__rem), "=r" (__res) \
                : "r" (__n), "r" (__base) \
                : "ip", "lr", "cc"); \
        n = __res; \
        __rem; \
})

#if __GNUC__ < 4 || !defined(CONFIG_AEABI)

/*
 * gcc versions earlier than 4.0 are simply too problematic for the
 * optimized implementation below.  First, gcc PR 15089 tends to trigger
 * on more complex constructs; spurious .global __udivsi3 directives are
 * emitted even though none of those symbols are referenced in the
 * generated code; and those gcc versions are unable to do constant
 * propagation on long long values anyway.
 */
#define do_div(n, base) __do_div_asm(n, base)

#elif __GNUC__ >= 4

#include <asm/bug.h>

/*
 * If the divisor happens to be constant, we determine the appropriate
 * inverse at compile time to turn the division into a few inline
 * multiplications instead, which is much faster.  This only applies when
 * compiling for ARMv4 or higher (we need umull/umlal) and when the gcc
 * version is recent enough to perform proper long long constant
 * propagation.  (It is unfortunate that gcc doesn't do all this
 * internally.)
 */
#define do_div(n, base) \
({ \
        unsigned int __r, __b = (base); \
        if (!__builtin_constant_p(__b) || __b == 0 || \
            (__LINUX_ARM_ARCH__ < 4 && (__b & (__b - 1)) != 0)) { \
                /* non-constant divisor (or zero): slow path */ \
                __r = __do_div_asm(n, __b); \
        } else if ((__b & (__b - 1)) == 0) { \
                /* Trivial: __b is constant and a power of 2 */ \
                /* gcc does the right thing with this code.  */ \
                __r = n; \
                __r &= (__b - 1); \
                n /= __b; \
        } else { \
                /* Multiply by inverse of __b: n/b = n*(p/b)/p */ \
                /* We rely on the fact that most of this code gets */ \
                /* optimized away at compile time due to constant */ \
                /* propagation and only a couple of inline assembly */ \
                /* instructions should remain.  Better avoid any */ \
                /* code construct that might prevent that. */ \
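                /* In outline: __p is a power of 2 and __m is */ \
                /* approximately (__p << 64) / __b, so the quotient */ \
                /* becomes n/b = (__n * __m) >> 64, then divided by */ \
                /* __p, i.e. one 64x64->128 bit multiply plus shifts. */ \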
                unsigned long long __res, __x, __t, __m, __n = n; \
                unsigned int __c, __p, __z = 0; \
                /* preserve low part of n for remainder computation */ \
                __r = __n; \
                /* determine number of bits to represent __b */ \
                __p = 1 << __div64_fls(__b); \
                /* compute __m = ((__p << 64) + __b - 1) / __b */ \
                __m = (~0ULL / __b) * __p; \
                __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \
                /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \
                __x = ~0ULL / __b * __b - 1; \
                __res = (__m & 0xffffffff) * (__x & 0xffffffff); \
                __res >>= 32; \
                __res += (__m & 0xffffffff) * (__x >> 32); \
                __t = __res; \
                __res += (__x & 0xffffffff) * (__m >> 32); \
                __t = (__res < __t) ? (1ULL << 32) : 0; \
                __res = (__res >> 32) + __t; \
                __res += (__m >> 32) * (__x >> 32); \
                __res /= __p; \
                /* Now sanitize and optimize what we've got. */ \
                if (~0ULL % (__b / (__b & -__b)) == 0) { \
                        /* those cases can be simplified with: */ \
                        __n /= (__b & -__b); \
                        __m = ~0ULL / (__b / (__b & -__b)); \
                        __p = 1; \
                        __c = 1; \
                } else if (__res != __x / __b) { \
                        /* We can't get away without a correction */ \
                        /* to compensate for bit truncation errors. */ \
                        /* To avoid it we'd need an additional bit */ \
                        /* to represent __m, which would overflow it. */ \
                        /* Instead we do m = p/b and n/b = (n*m + m)/p. */ \
                        __c = 1; \
                        /* Compute __m = (__p << 64) / __b */ \
                        __m = (~0ULL / __b) * __p; \
                        __m += ((~0ULL % __b + 1) * __p) / __b; \
                } else { \
                        /* Reduce __m/__p, and try to clear bit 31 */ \
                        /* of __m when possible, otherwise that'll */ \
                        /* need extra overflow handling later. */ \
                        unsigned int __bits = -(__m & -__m); \
                        __bits |= __m >> 32; \
                        __bits = (~__bits) << 1; \
                        /* If __bits == 0 then setting bit 31 is */ \
                        /* unavoidable.  Simply apply the maximum */ \
                        /* possible reduction in that case. */ \
                        /* Otherwise the MSB of __bits indicates the */ \
                        /* best reduction we should apply. */ \
                        if (!__bits) { \
                                __p /= (__m & -__m); \
                                __m /= (__m & -__m); \
                        } else { \
                                __p >>= __div64_fls(__bits); \
                                __m >>= __div64_fls(__bits); \
                        } \
                        /* No correction needed. */ \
                        __c = 0; \
                } \
                /* Now we have a combination of 2 conditions: */ \
                /* 1) whether or not we need a correction (__c), and */ \
                /* 2) whether or not there might be an overflow in */ \
                /*    the cross product (__m & ((1 << 63) | (1 << 31))) */ \
                /* Select the best insn combination to perform the */ \
                /* actual __m * __n / (__p << 64) operation. */ \
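                /* Each variant below accumulates the four 32x32 */ \
                /* partial products of __m * __n with umull/umlal */ \
                /* and keeps only the upper 64 bits of the 128-bit */ \
                /* product (adding __m first when __c is set). */ \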
                if (!__c) { \
                        asm (   "umull  %Q0, %R0, %Q1, %Q2\n\t" \
                                "mov    %Q0, #0" \
                                : "=&r" (__res) \
                                : "r" (__m), "r" (__n) \
                                : "cc" ); \
                } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
                        __res = __m; \
                        asm (   "umlal  %Q0, %R0, %Q1, %Q2\n\t" \
                                "mov    %Q0, #0" \
                                : "+&r" (__res) \
                                : "r" (__m), "r" (__n) \
                                : "cc" ); \
                } else { \
                        asm (   "umull  %Q0, %R0, %Q1, %Q2\n\t" \
                                "cmn    %Q0, %Q1\n\t" \
                                "adcs   %R0, %R0, %R1\n\t" \
                                "adc    %Q0, %3, #0" \
                                : "=&r" (__res) \
                                : "r" (__m), "r" (__n), "r" (__z) \
                                : "cc" ); \
                } \
                if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \
                        asm (   "umlal  %R0, %Q0, %R1, %Q2\n\t" \
                                "umlal  %R0, %Q0, %Q1, %R2\n\t" \
                                "mov    %R0, #0\n\t" \
                                "umlal  %Q0, %R0, %R1, %R2" \
                                : "+&r" (__res) \
                                : "r" (__m), "r" (__n) \
                                : "cc" ); \
                } else { \
                        asm (   "umlal  %R0, %Q0, %R2, %Q3\n\t" \
                                "umlal  %R0, %1, %Q2, %R3\n\t" \
                                "mov    %R0, #0\n\t" \
                                "adds   %Q0, %1, %Q0\n\t" \
                                "adc    %R0, %R0, #0\n\t" \
                                "umlal  %Q0, %R0, %R2, %R3" \
                                : "+&r" (__res), "+&r" (__z) \
                                : "r" (__m), "r" (__n) \
                                : "cc" ); \
                } \
                __res /= __p; \
                /* The remainder can be computed with 32-bit regs */ \
                /* only, and gcc is good at that. */ \
                { \
                        unsigned int __res0 = __res; \
                        unsigned int __b0 = __b; \
                        __r -= __res0 * __b0; \
                } \
                /* BUG_ON(__r >= __b || __res * __b + __r != n); */ \
                n = __res; \
        } \
        __r; \
})

/* our own fls implementation to make sure constant propagation is fine */
#define __div64_fls(bits) \
({ \
        unsigned int __left = (bits), __nr = 0; \
        if (__left & 0xffff0000) __nr += 16, __left >>= 16; \
        if (__left & 0x0000ff00) __nr +=  8, __left >>=  8; \
        if (__left & 0x000000f0) __nr +=  4, __left >>=  4; \
        if (__left & 0x0000000c) __nr +=  2, __left >>=  2; \
        if (__left & 0x00000002) __nr +=  1; \
        __nr; \
})

#endif

#endif
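
/*
 * Minimal usage sketch (illustrative only, not part of this header's
 * interface; the variable names below are made up):
 *
 *	u64 bytes = 1000000123ULL;
 *	u32 rem;
 *
 *	rem = do_div(bytes, 1000);
 *
 * After the call, bytes holds the quotient 1000000 (updated in place)
 * and rem holds the remainder 123 returned by do_div().
 */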