/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * memset16: fill a region with a 16-bit pattern.
 * r3 = dest, r4 = 16-bit value, r5 = number of halfwords to store.
 */
_GLOBAL(memset16)
	rlwinm.	r0, r5, 31, 1, 31
	addi	r6, r3, -4
	beq-	2f
	rlwimi	r4, r4, 16, 0, 15
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until the cache is enabled. Will be
	 * replaced by 'bne' during boot to use the normal procedure
	 * if r4 is not zero.
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
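/*
 * memmove/memcpy: r3 = dest, r4 = src, r5 = byte count; dest is
 * returned unchanged in r3 (these routines never modify it).
 * memmove branches to backwards_memcpy when dest > src, otherwise it
 * falls through to memcpy (a forward copy is safe when dest <= src).
 */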
/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

/*
 * Plain, cache-agnostic copy (no dcbz): used via code patching before
 * the data cache is enabled, and whenever the source and dest overlap.
 */
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
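/*
 * Descending copy for overlapping regions where dest > src (reached
 * from memmove above).  Same register usage as memcpy: r3 = dest,
 * r4 = src, r5 = byte count.
 */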
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

/*
 * __copy_tofrom_user: copy between user and kernel space with fault
 * handling.  r3 = dest, r4 = src, r5 = byte count.  Returns in r3 the
 * number of bytes not copied (0 on success); faults are recovered
 * through the exception table entries below.
 */
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
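/*
 * Copy any remaining words (labels 30/31) and bytes (labels 40/41).
 * These accesses are covered by the EX_TABLE entries at the end of
 * this function, so a fault here lands in the fixup code below.
 */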
63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to either
 * 104f (if in the read part) or 105f (if in the write part), after
 * updating r5.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* return the number of bytes not copied */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)