/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(memset16)
	rlwinm.	r0, r5, 31, 1, 31
	addi	r6, r3, -4
	beq-	2f
	rlwimi	r4, r4, 16, 0, 15
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore skip the optimised block that uses dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until cache is enabled. Will be replaced
	 * by 'bne' during boot to use the normal procedure if r4 is not zero.
	 */
_GLOBAL(memset_nocache_branch)
	b	2f

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
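
/*
 * All of the copy routines below take their arguments in the usual
 * 32-bit PowerPC registers: r3 = destination, r4 = source, r5 = length
 * in bytes.  memcpy and memmove leave r3 (the destination) untouched as
 * their return value; __copy_tofrom_user instead returns in r3 the
 * number of bytes that could not be copied.
 */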

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, cache might not be active yet, so dcbz cannot be used.
 * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
 * replaced by a nop once cache is active. This is done in machine_init()
 */
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
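
/*
 * __copy_tofrom_user: copy r5 bytes from r4 to r3, where one side is a
 * user-space address.  Faults on the user-space accesses are caught via
 * the EX_TABLE entries and handled by the fixup code at the end of the
 * routine.  Returns 0 in r3 on success, or the number of bytes that were
 * not copied.
 */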
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr
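
/*
 * Fault fixup code for __copy_tofrom_user.  Each faulting load/store
 * above has an EX_TABLE entry directing it to one of the labels below,
 * which work out how many bytes remain uncopied (see the comment before
 * label 99 below) and return that count in r3.
 */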
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)