/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

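/*
 * backwards_memcpy is the overlapping-copy path taken by memmove when
 * the destination starts above the source: it mirrors memcpy but walks
 * both buffers from their ends toward their starts.
 */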
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

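/*
 * Exception fixup for __copy_tofrom_user.  The handlers below record
 * whether the fault hit a load (r9 = 0) or a store (r9 = 1) and set r3
 * to the log2 of the chunk size the interrupted loop was using, so the
 * common code at labels 99 and 106 can work out how many bytes remain.
 */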
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text