/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if CACHE_LINE_SIZE >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* CACHE_LINE_SIZE */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
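
/*
 * memset fills r5 bytes at r3 with the low byte of r4.  As an
 * illustration of the two rlwimi instructions below, which replicate
 * that byte across the whole word before the word loop runs, assume
 * r4 = 0x000000ab on entry:
 *
 *	rlwimi	r4,r4,8,16,23	- rotate left 8, insert under mask
 *				  bits 16..23: r4 becomes 0x0000abab
 *	rlwimi	r4,r4,16,0,15	- rotate left 16, insert under mask
 *				  bits 0..15:  r4 becomes 0xabababab
 */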
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
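
/*
 * How the user-copy fault handling below works: every load or store
 * that may touch user memory carries a numeric label, and an
 * __ex_table entry pairs the address of that instruction with a
 * fixup address, e.g.
 *
 *	.long	70b,100f	- a fault at 70: resumes at 100:
 *
 * On a fault, the exception handler looks the faulting address up in
 * __ex_table and jumps to the fixup, which works out how many bytes
 * remain uncopied.  COPY_16_BYTES_WITHEX and COPY_16_BYTES_EXCODE
 * generate the same pairings for the unrolled cacheline loop.
 */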
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
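/*
 * Worked example, assuming 32-byte cache lines: for a fault in the
 * cacheline loop, r3 = LG_CACHELINE_BYTES = 5, so if r0 says three
 * whole cachelines were still to go, the slw at 106: gives
 * 3 << 5 = 96 bytes, to which r5 adds the sub-cacheline tail.
 */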
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
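
/*
 * Note on the table above: a read fault at 130: is directed to 132:,
 * which zeroes the rest of the destination, while faults in the
 * stores at 131:, 112: and 114: go straight to the return at 120:,
 * as nothing more can usefully be done.
 */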