/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
.section __ex_table,"a";			\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore skip the optimised block that uses dcbz.  This jump
 * is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	cmplwi	0,r4,0
	bne	2f	/* Use normal procedure if r4 is not zero */
_GLOBAL(memset_nocache_branch)
	b	2f	/* Skip optimised block until cache is enabled */

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

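/*
 * For reference, the fast path above is roughly the following C.  This is
 * only a sketch: it assumes a cacheable destination, and cacheline_zero()
 * is a hypothetical stand-in for the dcbz instruction, not a real kernel
 * helper.
 *
 *	void *memset_sketch(void *s, int c, size_t n)
 *	{
 *		unsigned char *p = s;
 *
 *		while (n && ((unsigned long)p & (CACHELINE_BYTES - 1))) {
 *			*p++ = c;			// align to a cache line
 *			n--;
 *		}
 *		if (c == 0)				// dcbz can only write zeroes
 *			for (; n >= CACHELINE_BYTES; n -= CACHELINE_BYTES) {
 *				cacheline_zero(p);	// dcbz
 *				p += CACHELINE_BYTES;
 *			}
 *		while (n--)				// trailing bytes
 *			*p++ = c;
 *		return s;
 *	}
 */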
/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore jump to generic_memcpy, which doesn't use dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr

_GLOBAL(generic_memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

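/*
 * For reference, the overlap test that guards the dcbz copy path in
 * memcpy() above is the usual interval check; as a C sketch only (not a
 * real kernel helper):
 *
 *	static int regions_overlap(const void *dst, const void *src, size_t n)
 *	{
 *		unsigned long d = (unsigned long)dst, s = (unsigned long)src;
 *
 *		return s < d + n && d < s + n;	// the cmplw/cmplw/crand above
 *	}
 *
 * memmove() gets away with the cheaper one-sided test: only when dst > src
 * can a forward copy clobber source bytes it has not read yet, so that case
 * is sent to backwards_memcpy, which copies from the end of the buffers.
 */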
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

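/*
 * The exception fixups below implement, roughly, the usual user-copy
 * contract.  A C sketch only: copy_byte() is a hypothetical stand-in for a
 * single byte copy that may fault, and the return value is the number of
 * bytes NOT copied (what __copy_tofrom_user returns in r3).
 *
 *	static size_t fixup_sketch(char *dst, const char *src,
 *				   size_t not_copied, int write_fault)
 *	{
 *		if (!write_fault) {
 *			while (not_copied && copy_byte(dst, src) == 0) {
 *				dst++;			// salvage what we can
 *				src++;
 *				not_copied--;
 *			}
 *			memset(dst, 0, not_copied);	// clear the rest of dst
 *		}
 *		return not_copied;
 *	}
 *
 * Each handler first reconstructs "not_copied" from the loop the fault hit:
 * the leftover byte count in r5 plus the remaining iteration count scaled
 * by that loop's step size, i.e. r5 + (ctr << r3).
 */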
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
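/*
 * Each __ex_table entry above pairs the address of an instruction that may
 * fault on a user access with the address of its fixup code.  Roughly, for
 * this 32-bit layout (a sketch; see the architecture's uaccess definitions
 * for the authoritative structure):
 *
 *	struct exception_table_entry {
 *		unsigned long insn;	// address of the faulting instruction
 *		unsigned long fixup;	// where to resume after the fault
 *	};
 *
 * When a user copy faults, the exception handler looks the faulting PC up
 * in this table and resumes at the fixup address instead of oopsing; the
 * fixup code then produces the "bytes not copied" return value as above.
 */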