/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot
 * be used.  We therefore skip the optimised block that uses dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */
EXPORT_SYMBOL(memset)
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip optimised block until cache is enabled */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
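
/*
 * A rough C sketch of the byte-splat done by the two rlwimi
 * instructions at the top of memset (illustrative only, assuming the
 * fill value in r4 is a zero-extended byte): the byte is replicated
 * into all four byte lanes so the loops can store whole words, e.g.
 * 0x000000ab becomes 0xabababab:
 *
 *	v |= v << 8;		0x000000ab -> 0x0000abab
 *	v |= v << 16;		0x0000abab -> 0xabababab
 */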

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot
 * be used.  We therefore jump to generic_memcpy, which doesn't use dcbz.
 * This jump is replaced by a nop once the cache is active.  This is
 * done in machine_init().
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
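
/*
 * __copy_tofrom_user copies between kernel and user space, so any of
 * the loads and stores below may fault.  Each faultable instruction is
 * paired with a fixup address via the __ex_table entries.  Conceptually
 * (a sketch of the mechanism, assuming the classic absolute-address
 * extable format used by the .long pairs in this file), each entry is:
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	address of the faulting instruction
 *		unsigned long fixup;	address to resume execution at
 *	};
 *
 * On a fault, the exception handler looks up the faulting PC and
 * branches to the fixup code, which works out how many bytes were left
 * uncopied and returns that count in r3.
 */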
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
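/*
 * Worked example (illustrative, assuming 32-byte cache lines): after a
 * fault in the cacheline loop, r3 = LG_CACHELINE_BYTES = 5, so with 2
 * whole lines still to go and r5 = 7 trailing bytes, the count of
 * bytes not copied is 7 + (2 << 5) = 71.
 */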
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
EXPORT_SYMBOL(__copy_tofrom_user)
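
/*
 * Note: the value returned in r3 is the number of bytes NOT copied.
 * A typical C caller (a sketch of the usual pattern, not code from
 * this file) therefore looks like:
 *
 *	left = __copy_tofrom_user(to, from, n);
 *	if (left)
 *		return -EFAULT;
 */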