/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
_GLOBAL(memset)
	/*
	 * Spread the fill byte into all four bytes of r4
	 * (e.g. 0x000000ab -> 0xabababab) so whole words can be stored.
	 */
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f			/* Use normal procedure if r4 is not zero */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1		/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

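	/*
	 * Copy the leading bytes (first byte by byte to word alignment,
	 * then word by word) so that the destination is cache-line
	 * aligned before the dcbz loop below.
	 */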
	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr

_GLOBAL(generic_memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
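	/*
	 * At this point r0 holds the number of complete cache lines to
	 * copy, r3 the dcbt offset from r4 for the next prefetch, and
	 * r7 the prefetch depth chosen so far (1 line).
	 */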
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
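
/*
 * Each __ex_table entry above pairs the address of an instruction that
 * may fault on a user access with a fixup address; on such a fault the
 * exception handler transfers control to the fixup code, which works
 * out how many bytes were left uncopied and returns that count in r3.
 */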