/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/code-patching-asm.h>
#include <asm/kasan.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

#ifndef CONFIG_KASAN
_GLOBAL(memset16)
	rlwinm.	r0, r5, 31, 1, 31	/* r0 = count >> 1 = # of full words */
	addi	r6, r3, -4
	beq-	2f
	rlwimi	r4, r4, 16, 0, 15	/* replicate halfword into both halves of r4 */
	mtctr	r0
1:	stwu	r4, 4(r6)
	bdnz	1b
2:	andi.	r0, r5, 1		/* odd trailing halfword left? */
	beqlr
	sth	r4, 4(r6)
	blr
EXPORT_SYMBOL(memset16)
#endif

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot
 * be used.  We therefore skip the optimised block that uses dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
_GLOBAL_KASAN(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23		/* replicate the byte into all of r4 */
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
	/*
	 * Skip the optimised block until the cache is enabled.  Will be
	 * replaced by 'bne' during boot to use the normal procedure
	 * if r4 is not zero.
	 */
5:	b	2f
	patch_site	5b, patch__memset_nocache

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL_KASAN(memset)
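
/*
 * A minimal C sketch of the boot-time patching referred to above and
 * below.  It mirrors machine_init() in arch/powerpc/kernel/setup_32.c
 * (helper names from asm/code-patching.h; exact details vary by kernel
 * version, so treat this as illustrative, not authoritative):
 *
 *	u32 *addr = (u32 *)patch_site_addr(&patch__memset_nocache);
 *
 *	// Cache is up: make memcpy skip its "b generic_memcpy" escape...
 *	patch_instruction_site(&patch__memcpy_nocache, PPC_INST_NOP);
 *	// ...and turn memset's "5: b 2f" into "bne 2f", so the dcbz
 *	// block runs for zero fills (cr0 holds the "cmplwi 0,r4,0" result).
 *	patch_instruction(addr,
 *		create_cond_branch(addr, branch_target(addr), 0x820000));
 */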

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot
 * be used.  We therefore jump to generic_memcpy, which doesn't use dcbz.
 * This jump is replaced by a nop once the cache is active.  This is
 * done in machine_init().
 */
_GLOBAL_KASAN(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL_KASAN(memcpy)
1:	b	generic_memcpy
	patch_site	1b, patch__memcpy_nocache

	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)
EXPORT_SYMBOL_KASAN(memcpy)
EXPORT_SYMBOL_KASAN(memmove)

generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
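
/*
 * Illustrative C equivalent of the dispatch implemented by memmove,
 * memcpy, generic_memcpy and backwards_memcpy above (a sketch, not the
 * kernel's generic lib/string.c code):
 *
 *	void *memmove(void *dest, const void *src, size_t n)
 *	{
 *		if (dest <= src)			// safe to copy forwards
 *			return memcpy(dest, src, n);
 *		return backwards_memcpy(dest, src, n);	// copy high-to-low
 *	}
 *
 * memcpy() itself branches to generic_memcpy() while the cache is still
 * off, or when source and destination overlap; otherwise it uses the
 * dcbz fast path on whole cache lines.
 */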

_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr
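
/*
 * The remainder of this file is the fault-fixup code for
 * __copy_tofrom_user.  Each EX_TABLE(a,b) entry directs the page-fault
 * handler to resume at label 'b' when the access at label 'a' faults.
 * Every fixup sets r9 (0 = read fault, 1 = write fault) and a scale r3
 * such that the bytes left uncopied come out as r5 + (ctr << r3).
 * Worked example (illustrative numbers): a fault in the final word loop
 * sets r3 = 2; with CTR = 5 words still to go and r5 = 3 trailing bytes,
 * 3 + (5 << 2) = 23 bytes remain uncopied.
 */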

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to either
 * 104f (if in the read part) or 105f (if in the write part), after
 * updating r5.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* retry done (or faulted again): return # of bytes not copied;
   the generic uaccess caller zeroes the uncopied destination tail */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)
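
/*
 * Usage sketch (C, illustrative): __copy_tofrom_user() returns the
 * number of bytes NOT copied, 0 meaning complete success.  It backs
 * raw_copy_to_user()/raw_copy_from_user() in asm/uaccess.h, so callers
 * follow the usual idiom:
 *
 *	if (copy_from_user(kbuf, ubuf, len))	// nonzero => faulted part-way
 *		return -EFAULT;			// tail of kbuf already zeroed
 */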