/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are used to obtain the best performance under various
 * conditions.  In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs.  This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers.  Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in
 * a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly close to optimal.  Unfortunately some of
 * the semantics of the copy routine (exception handling) are difficult to
 * express in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands.  It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already.  This routine is
 * able to beat it by 30-40% for aligned copies because of the loop
 * unrolling, but in some cases the glibc version is still slightly faster.
 * This lends credibility to the idea that gcc can generate very good code
 * as long as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2) ({					\
	unsigned int _r;						\
	asm volatile (							\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
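#if 0
/* Illustrative only, not compiled: a plain-C model of MERGE() above, under
 * the assumptions that unsigned int is 32 bits and 0 < sh_1 < 32 (which is
 * how copy_dstaligned() below sets it up; for sh_1 == 0 the C shifts would
 * be undefined, while shrpw handles that case in hardware).  shrpw shifts
 * the 64-bit pair {w0,w1} right by %sar bits and keeps the low word, so on
 * this big-endian machine it splices the trailing bytes of w0 onto the
 * leading bytes of w1.  The function name is made up for illustration. */
static inline unsigned int merge_model(unsigned int w0, int sh_1,
					unsigned int w1, int sh_2)
{
	return (w0 << sh_1) | (w1 >> sh_2);
}
#endif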
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)		\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif
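/* For reference, the fault-handling pattern the macros above generate:
 * ldwma(s_space, pws, r1, pmc_load_exc) expands to (approximately)
 *
 *	__asm__ __volatile__ (
 *		"1:\tldw,ma 4(%%sr1,%1), %0\n"
 *		"\t.section __ex_table,\"aw\"\n"
 *		"\t" EXC_WORD "\t1b\n"
 *		"\t" EXC_WORD "\tpmc_load_exc\n"
 *		"\t.previous\n"
 *		: "=r"(r1), "+r"(pws)
 *		:
 *		: "r8");
 *
 * i.e. a post-incrementing load through the source space register plus an
 * __ex_table entry that redirects a faulting access to the pmc_load_exc
 * label inside pa_memcpy() below.
 */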
/* Copy from an unaligned src to an aligned dst, using shifts.  Handles 4
 * words per loop iteration.  This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
					unsigned long len, unsigned long o_dst,
					unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate the shift amounts needed to recombine words read from
	   the rounded-down (aligned) src into aligned destination words. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
	case 2:
		/* a1 = ((unsigned int *) src)[0];
		   a2 = ((unsigned int *) src)[1]; */
		ldw(s_space, 0, src, a1, cda_ldw_exc);
		ldw(s_space, 4, src, a2, cda_ldw_exc);
		src -= 1 * sizeof(unsigned int);
		dst -= 3 * sizeof(unsigned int);
		len += 2;
		goto do1;
	case 3:
		/* a0 = ((unsigned int *) src)[0];
		   a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		src -= 0 * sizeof(unsigned int);
		dst -= 2 * sizeof(unsigned int);
		len += 1;
		goto do2;
	case 0:
		if (len == 0)
			return 0;
		/* a3 = ((unsigned int *) src)[0];
		   a0 = ((unsigned int *) src)[1]; */
		ldw(s_space, 0, src, a3, cda_ldw_exc);
		ldw(s_space, 4, src, a0, cda_ldw_exc);
		src -= -1 * sizeof(unsigned int);
		dst -= 1 * sizeof(unsigned int);
		len += 0;
		goto do3;
	case 1:
		/* a2 = ((unsigned int *) src)[0];
		   a3 = ((unsigned int *) src)[1]; */
		ldw(s_space, 0, src, a2, cda_ldw_exc);
		ldw(s_space, 4, src, a3, cda_ldw_exc);
		src -= -2 * sizeof(unsigned int);
		dst -= 0 * sizeof(unsigned int);
		len -= 1;
		if (len == 0)
			goto do0;
		goto do4;			/* No-op.  */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len * 4 - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len * 4 - d->fault_addr + o_dst;
}
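#if 0
/* Illustrative only, not compiled: a simplified user-space model of the
 * merge strategy above, without the unrolling, the pre-loop fixups, or the
 * fault handling.  It assumes big-endian 32-bit words (as on PA-RISC), a
 * word-aligned dst and a non-word-aligned src, and like the routine above
 * it only ever reads whole aligned source words.  For example, with
 * src % 4 == 1 we get sh_1 = 8, sh_2 = 24, and every destination word is
 * the last three bytes of one source word followed by the first byte of
 * the next.  The function name is made up. */
static void merge_copy_model(unsigned int *dst, const unsigned char *src,
			     unsigned long words)
{
	int sh_1 = 8 * ((unsigned long)src % sizeof(unsigned int));
	int sh_2 = 8 * sizeof(unsigned int) - sh_1;
	const unsigned int *s = (const unsigned int *)
				((unsigned long)src & -sizeof(unsigned int));
	unsigned int a = *s++;

	while (words--) {
		unsigned int b = *s++;

		*dst++ = (a << sh_1) | (b >> sh_2);	/* same as MERGE() */
		a = b;
	}
}
#endif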
/* Returns 0 for success; otherwise returns the number of bytes not
 * transferred.  */
unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int) - 1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
			o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}
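#if 0
/* Illustrative only, not compiled: the alignment dispatch used by
 * pa_memcpy() above, written out as a hypothetical helper (it ignores the
 * short-copy THRESHOLD cutoff).  For example, with src = 0x1003 and
 * dst = 0x2007, src ^ dst = 0x3004: bit 2 is set, so the operands can
 * never reach the same 8-byte phase, but bits 0-1 are clear, so a few
 * leading byte copies bring both to word alignment and the word loops can
 * be used.  The enum and function names are made up. */
enum copy_strategy {
	COPY_ALIGNED,		/* same 8-byte phase: main aligned loops */
	COPY_WORD_ALIGNED,	/* same 4-byte phase: word loops only */
	COPY_SHIFT_MERGE	/* align dst only, merge words with shifts */
};

static enum copy_strategy pick_strategy(unsigned long src, unsigned long dst)
{
	unsigned long t = src ^ dst;

	if ((t & (sizeof(double) - 1)) == 0)
		return COPY_ALIGNED;
	if ((t & (sizeof(unsigned int) - 1)) == 0)
		return COPY_WORD_ALIGNED;
	return COPY_SHIFT_MERGE;
}
#endif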
#ifdef __KERNEL__
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif
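#if 0
/* Illustrative caller sketch only, not compiled and not part of this
 * file's interface: the routines above return the number of bytes NOT
 * transferred, so a typical copy_from_user() caller zeroes the tail that
 * a fault left uncopied before using the buffer.  The function name is
 * made up. */
static unsigned long example_read_from_user(void *kbuf,
		const void __user *ubuf, unsigned long len)
{
	unsigned long left = copy_from_user(kbuf, ubuf, len);

	if (left)
		memset((char *)kbuf + (len - left), 0, left);
	return left;
}
#endif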