/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES			\
	lwz	r7,4(r4);		\
	lwz	r8,8(r4);		\
	lwz	r9,12(r4);		\
	lwzu	r10,16(r4);		\
	stw	r7,4(r6);		\
	stw	r8,8(r6);		\
	stw	r9,12(r6);		\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)		\
8 ## n ## 0:				\
	lwz	r7,4(r4);		\
8 ## n ## 1:				\
	lwz	r8,8(r4);		\
8 ## n ## 2:				\
	lwz	r9,12(r4);		\
8 ## n ## 3:				\
	lwzu	r10,16(r4);		\
8 ## n ## 4:				\
	stw	r7,4(r6);		\
8 ## n ## 5:				\
	stw	r8,8(r6);		\
8 ## n ## 6:				\
	stw	r9,12(r6);		\
8 ## n ## 7:				\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if CACHE_LINE_SIZE >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* CACHE_LINE_SIZE */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

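/*
 * The copy routines below all take the standard memcpy argument
 * order: r3 = destination, r4 = source, r5 = byte count.  The
 * word-copy loops run with r4 and r6 biased down by 4 so that
 * lwzu/stwu can advance the pointer and move the data in a single
 * update-form instruction; r7-r10 are scratch and CTR holds the
 * loop count.
 */
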
/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

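/*
 * Copy a block between kernel and user space.  Either the source
 * or the destination may be a user address and may fault; every
 * load and store that can touch user memory has an __ex_table
 * entry pointing at the fixup code further down in this file.
 * The routine returns the number of bytes NOT copied in r3
 * (0 on success), i.e. roughly:
 *
 *	unsigned long __copy_tofrom_user(void *to, const void *from,
 *					 unsigned long size);
 *
 * where the return value is how much of "size" was left uncopied.
 */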
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

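/*
 * Fixup code for faults taken inside __copy_tofrom_user.  Each
 * handler below records in r9 whether the fault was on a load (0)
 * or a store (1), and puts in r3 the log2 of the bytes handled per
 * iteration of the interrupted loop (0 for the byte loops, 2 for
 * the word loops, LG_CACHELINE_BYTES for the cacheline loop), then
 * joins the common code at 99f/106f, which works out the number of
 * bytes still uncopied as roughly r5 + (CTR << r3).  For example,
 * a read fault in the final word loop with 5 iterations left gives
 * about 5*4 + (r5 & 3) bytes not copied.
 */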
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text