/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * COPY_16_BYTES: copy 16 bytes from 4(r4) to 4(r6) as four words,
 * using r7-r10 as scratch.  The last load and the last store use the
 * update forms, so r4 and r6 both end up advanced by 16.
 */
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_WITHEX(n): the same instruction sequence as
 * COPY_16_BYTES, but each instruction carries a numeric label:
 * 8n0-8n3 on the four loads and 8n4-8n7 on the four stores.
 * COPY_16_BYTES_EXCODE(n) refers to these labels.
 */
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

/*
 * COPY_16_BYTES_EXCODE(n): fixup code for COPY_16_BYTES_WITHEX(n).
 * Label 9n0 is the fixup for the four loads (8n0-8n3) and label 9n1
 * the fixup for the four stores (8n4-8n7).  Both subtract 16 * n from
 * r5 and then branch to 104f (loads) or 105f (stores).
 * NOTE(review): EX_TABLE is defined outside this file; it presumably
 * records an (instruction, fixup) pair in the exception table.
 */
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

/* Cache line geometry, taken from the L1_CACHE_* definitions. */
CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 *
 * Register usage, as read from the code below:
 *   r3 = destination (never modified by this routine)
 *   r4 = fill value; its low byte is replicated into all four bytes
 *   r5 = number of bytes to set
 *   r6 = running destination pointer, biased for the update-form stores
 *   r0, r7, r8, r9 = scratch
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23	/* copy the low byte into the byte above it */
	rlwimi	r4,r4,16,0,15	/* copy the low halfword into the high halfword */

	addi	r6,r3,-4	/* r6 = dest - 4, so that 4(r6) is the first byte */
	cmplwi	0,r5,4
	blt	7f		/* fewer than 4 bytes: byte loop only */
	stwu	r4,4(r6)	/* store the first word; r6 = dest afterwards */
	beqlr			/* exactly 4 bytes (cr0 from the cmplwi): done */
	andi.	r0,r6,3		/* r0 = offset of dest within its word */
	add	r5,r0,r5	/* r5 = count + that offset */
	subf	r6,r0,r6	/* round r6 down to a word boundary */
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */
EXPORT_SYMBOL(memset)
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip optimised block until cache is enabled */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES	/* r7 = offset of r6 within its cache line */
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f		/* none: fall back to the plain word loop */
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2		/* r0 = words left before the last word of this line */
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)	/* fill up to the end of the current cache line */
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6		/* zero the cache line at r6 + 4 */
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4		/* + 4 because the bdz below skips one word */

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f		/* CTR is decremented first: (r5 >> 2) - 1 words */
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3		/* r5 = trailing byte count */
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3		/* so that 1(r6) is the next byte to write */
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 *
 * Register usage, as read from the code below:
 *   r3 = destination (never modified by these routines)
 *   r4 = source
 *   r5 = number of bytes to copy
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy	/* dest above source: copy from the end */
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy		/* the jump described in the comment above */
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4		/* bias both pointers by -4 for the */
	addi	r6,r3,-4		/* update-form loads and stores below */
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)		/* r6 was already bumped, hence offset 3 */
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6			/* zero the destination line at r6 + 4 */
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
	addi	r4,r4,3			/* so that 1(r4)/1(r6) are the next bytes */
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

/*
 * Forward copy that does not use dcbz.
 * r3 = destination (not modified), r4 = source, r5 = byte count.
 * Copies 8 bytes per iteration once the destination is word aligned,
 * then at most one more word, then the trailing bytes.
 */
generic_memcpy:
	srwi.	r7,r5,3			/* r7 = number of 8-byte chunks */
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7			/* bytes left after the 8-byte chunks */
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)		/* one more whole word */
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4			/* r0 = bytes needed to word-align dest */
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b

/*
 * Backward copy: same structure as generic_memcpy, but it starts at
 * the end of both buffers and works towards lower addresses.
 * r3 = destination (not modified), r4 = source, r5 = byte count.
 */
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5		/* r6 = one past the end of dest */
	add	r4,r4,r5		/* r4 = one past the end of source */
	beq	2f
	andi.	r0,r6,3			/* r0 = bytes above the last word boundary */
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0			/* copy r0 bytes so the end of dest is aligned */
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	beq	2b
	mtctr	r7
	b	1b

/*
 * Copy with exception handling on every load and store.
 * r3 = destination, r4 = source, r5 = byte count.
 * Returns in r3: 0 when the whole copy completed, otherwise the number
 * of bytes not copied.  After a read fault the fixup code first retries
 * the copy one byte at a time and then clears the rest of the
 * destination; after a write fault it returns immediately.
 * The copy itself has the same shape as the dcbz path of memcpy above,
 * with source prefetching (dcbt) added to the cache line loop.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5		/* r5 no longer counts the alignment bytes */
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * r7 = number of lines prefetched ahead.  Run the loop for
	 * r0 - r7 lines with prefetch, and keep r7 in r0 for a second
	 * pass over the remaining lines (see the bne 114b below).
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b			/* second pass over the last r0 lines */

63:	srwi.	r0,r5,2			/* remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3			/* remaining bytes */
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0			/* everything copied: return 0 */
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5		/* r8 = size of the initial byte run */
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8		/* lines left = held-back lines (r0) + ctr */
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5		/* r3 = number of bytes not copied */
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f			/* write fault: just return the count */
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)
	EX_TABLE(112b,120b)
	EX_TABLE(114b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)