/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#define FUNC(x)			\
	.globl	x;		\
	.type	x,@function;	\
	.align	4;		\
x:

/* Both these macros have to start with exactly the same insn */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c]; \
	st	%t4, [%dst + (offset) + 0x10]; \
	st	%t5, [%dst + (offset) + 0x14]; \
	st	%t6, [%dst + (offset) + 0x18]; \
	st	%t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	std	%t0, [%dst + (offset) + 0x00]; \
	std	%t2, [%dst + (offset) + 0x08]; \
	std	%t4, [%dst + (offset) + 0x10]; \
	std	%t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	st	%t0, [%dst - (offset) - 0x10]; \
	st	%t1, [%dst - (offset) - 0x0c]; \
	st	%t2, [%dst - (offset) - 0x08]; \
	st	%t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	std	%t0, [%dst - (offset) - 0x10]; \
	std	%t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - (offset) - 0x02], %t0; \
	ldub	[%src - (offset) - 0x01], %t1; \
	stb	%t0, [%dst - (offset) - 0x02]; \
	stb	%t1, [%dst - (offset) - 0x01];

/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	st	%t0, [%dst - (offset) - 0x20]; \
	st	%t1, [%dst - (offset) - 0x1c]; \
	st	%t2, [%dst - (offset) - 0x18]; \
	st	%t3, [%dst - (offset) - 0x14]; \
	st	%t4, [%dst - (offset) - 0x10]; \
	st	%t5, [%dst - (offset) - 0x0c]; \
	st	%t6, [%dst - (offset) - 0x08]; \
	st	%t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	std	%t0, [%dst - (offset) - 0x20]; \
	std	%t2, [%dst - (offset) - 0x18]; \
	std	%t4, [%dst - (offset) - 0x10]; \
	std	%t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c];
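/* Roughly, one MOVE_BIGCHUNK invocation copies a 32-byte block: four
 * doubleword loads followed by eight word stores.  MOVE_BIGALIGNCHUNK
 * moves the same block with four doubleword stores, for destinations
 * that are 8-byte aligned.  A C sketch of one such step (hypothetical
 * helper name, 32-bit words assumed):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void move_bigchunk(uint32_t *dst, const uint32_t *src)
 *	{
 *		uint32_t t[8];
 *
 *		memcpy(t, src, sizeof(t));	// four ldd: 4 x 8 bytes
 *		for (int i = 0; i < 8; i++)
 *			dst[i] = t[i];		// eight st: 8 x 4 bytes
 *	}
 *
 * The "same first insn" rule above matters because the copy loop below
 * branches to "82f + 4" with that first load already issued in the
 * branch delay slot, so both chunk variants must begin identically.
 */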
#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src + (offset) + 0x00], %t0; \
	ldub	[%src + (offset) + 0x01], %t1; \
	stb	%t0, [%dst + (offset) + 0x00]; \
	stb	%t1, [%dst + (offset) + 0x01];

#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t5; \
	srl	%t1, shir, %t6; \
	sll	%t0, shil, %t0; \
	or	%t5, %prev, %t5; \
	sll	%t1, shil, %prev; \
	or	%t6, %t0, %t0; \
	srl	%t2, shir, %t1; \
	srl	%t3, shir, %t6; \
	sll	%t2, shil, %t2; \
	or	%t1, %prev, %t1; \
	std	%t4, [%dst + (offset) + (offset2) - 0x04]; \
	std	%t0, [%dst + (offset) + (offset2) + 0x04]; \
	sll	%t3, shil, %prev; \
	or	%t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t4; \
	srl	%t1, shir, %t5; \
	sll	%t0, shil, %t6; \
	or	%t4, %prev, %t0; \
	sll	%t1, shil, %prev; \
	or	%t5, %t6, %t1; \
	srl	%t2, shir, %t4; \
	srl	%t3, shir, %t5; \
	sll	%t2, shil, %t6; \
	or	%t4, %prev, %t2; \
	sll	%t3, shil, %prev; \
	or	%t5, %t6, %t3; \
	std	%t0, [%dst + (offset) + (offset2) + 0x00]; \
	std	%t2, [%dst + (offset) + (offset2) + 0x08];

	.text
	.align	4

0:
	retl
	nop		! Only bcopy returns here and it returns void...

#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
#endif
FUNC(memmove)
	cmp	%o0, %o1
	mov	%o0, %g7
	bleu	9f
	sub	%o0, %o1, %o4

	add	%o1, %o2, %o3
	cmp	%o3, %o0
	bleu	0f
	andcc	%o4, 3, %o5

	add	%o1, %o2, %o1
	add	%o0, %o2, %o0
	sub	%o1, 1, %o1
	sub	%o0, 1, %o0

1:	/* reverse_bytes */

	ldub	[%o1], %o4
	subcc	%o2, 1, %o2
	stb	%o4, [%o0]
	sub	%o1, 1, %o1
	bne	1b
	sub	%o0, 1, %o0

	retl
	mov	%g7, %o0

/* NOTE: This code is executed just for the cases
 *	 where %src (= %o1) & 3 != 0.
 *	 We need to align it to 4.  So, for (%src & 3):
 *		1  we need to do ldub, lduh
 *		2  lduh
 *		3  just ldub
 *	 so even if it looks weird, the branches
 *	 are correct here. -jj
 */
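/* A rough C sketch of the dword_align prologue below (hypothetical
 * helper, names made up): advance src to a 4-byte boundary.  This path
 * is only taken when (dst - src) & 3 == 0, so dst has the same low two
 * bits as src and the halfword store in the assembly is safe.
 *
 *	static void dword_align(unsigned char **dst,
 *				const unsigned char **src,
 *				unsigned long *len)
 *	{
 *		switch ((unsigned long)*src & 3) {
 *		case 1:				// ldub, then lduh
 *			*(*dst)++ = *(*src)++;
 *			(*len)--;
 *			// fall through
 *		case 2:				// lduh + sth
 *			(*dst)[0] = (*src)[0];
 *			(*dst)[1] = (*src)[1];
 *			*dst += 2;
 *			*src += 2;
 *			*len -= 2;
 *			break;
 *		case 3:				// just ldub
 *			*(*dst)++ = *(*src)++;
 *			(*len)--;
 *			break;
 *		}
 *	}
 */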
78:	/* dword_align */

	andcc	%o1, 1, %g0
	be	4f
	andcc	%o1, 2, %g0

	ldub	[%o1], %g2
	add	%o1, 1, %o1
	stb	%g2, [%o0]
	sub	%o2, 1, %o2
	bne	3f
	add	%o0, 1, %o0
4:
	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	sub	%o2, 2, %o2
	b	3f
	add	%o0, 2, %o0

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */

	sub	%o0, %o1, %o4
	mov	%o0, %g7
9:
	andcc	%o4, 3, %o5
0:
	bne	86f
	cmp	%o2, 15

	bleu	90f
	andcc	%o1, 3, %g0

	bne	78b
3:
	andcc	%o1, 4, %g0

	be	2f
	mov	%o2, %g1

	ld	[%o1], %o4
	sub	%g1, 4, %g1
	st	%o4, [%o0]
	add	%o1, 4, %o1
	add	%o0, 4, %o0
2:
	andcc	%g1, 0xffffff80, %g0
	be	3f
	andcc	%o0, 4, %g0

	be	82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	sub	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	5b
	add	%o0, 128, %o0
3:
	andcc	%g1, 0x70, %g4
	be	80f
	andcc	%g1, 8, %g0

	sethi	%hi(80f), %o5
	srl	%g4, 1, %o4
	add	%g4, %o4, %o4
	add	%o1, %g4, %o1
	sub	%o5, %o4, %o5
	jmpl	%o5 + %lo(80f), %g0
	add	%o0, %g4, %o0

79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be	81f
	andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	st	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
	st	%g3, [%o0 - 0x04]

81:	/* memcpy_last7 */

	be	1f
	andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	mov	%g7, %o0

82:	/* ldd_std */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	82b
	add	%o0, 128, %o0

	andcc	%g1, 0x70, %g4
	be	84f
	andcc	%g1, 8, %g0

	sethi	%hi(84f), %o5
	add	%o1, %g4, %o1
	sub	%o5, %g4, %o5
	jmpl	%o5 + %lo(84f), %g0
	add	%o0, %g4, %o0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be	85f
	andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	std	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
85:	/* amemcpy_last7 */
	be	1f
	andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	mov	%g7, %o0
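/* In the non_aligned path below, src and dst disagree in their low two
 * bits, so after dst is word aligned each output word is assembled from
 * two adjacent source words with a shift/or pair.  A rough C sketch
 * (hypothetical helper, big-endian, src_off = src & 3, which is 1..3 on
 * this path):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void shift_merge(uint32_t *dst, const uint32_t *src_words,
 *				unsigned int src_off, size_t nwords)
 *	{
 *		unsigned int shl = 8 * src_off;		// %g4 below
 *		unsigned int shr = 32 - shl;		// %l0 below
 *
 *		for (size_t i = 0; i < nwords; i++)
 *			dst[i] = (src_words[i] << shl) |
 *				 (src_words[i + 1] >> shr);
 *	}
 *
 * The assembly unrolls this four words per iteration and rotates the
 * loaded words through registers so each source word is read only once.
 */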
86:	/* non_aligned */
	cmp	%o2, 6
	bleu	88f
	nop

	save	%sp, -96, %sp
	andcc	%i0, 3, %g0
	be	61f
	andcc	%i0, 1, %g0
	be	60f
	andcc	%i0, 2, %g0

	ldub	[%i1], %g5
	add	%i1, 1, %i1
	stb	%g5, [%i0]
	sub	%i2, 1, %i2
	bne	61f
	add	%i0, 1, %i0
60:
	ldub	[%i1], %g3
	add	%i1, 2, %i1
	stb	%g3, [%i0]
	sub	%i2, 2, %i2
	ldub	[%i1 - 1], %g3
	add	%i0, 2, %i0
	stb	%g3, [%i0 - 1]
61:
	and	%i1, 3, %g2
	and	%i2, 0xc, %g3
	and	%i1, -4, %i1
	cmp	%g3, 4
	sll	%g2, 3, %g4
	mov	32, %g2
	be	4f
	sub	%g2, %g4, %l0

	blu	3f
	cmp	%g3, 0x8

	be	2f
	srl	%i2, 2, %g3

	ld	[%i1], %i3
	add	%i0, -8, %i0
	ld	[%i1 + 4], %i4
	b	8f
	add	%g3, 1, %g3
2:
	ld	[%i1], %i4
	add	%i0, -12, %i0
	ld	[%i1 + 4], %i5
	add	%g3, 2, %g3
	b	9f
	add	%i1, -4, %i1
3:
	ld	[%i1], %g1
	add	%i0, -4, %i0
	ld	[%i1 + 4], %i3
	srl	%i2, 2, %g3
	b	7f
	add	%i1, 4, %i1
4:
	ld	[%i1], %i5
	cmp	%i2, 7
	ld	[%i1 + 4], %g1
	srl	%i2, 2, %g3
	bleu	10f
	add	%i1, 8, %i1

	ld	[%i1], %i3
	add	%g3, -1, %g3
5:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0]
7:
	ld	[%i1 + 4], %i4
	sll	%g1, %g4, %g2
	srl	%i3, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 4]
8:
	ld	[%i1 + 8], %i5
	sll	%i3, %g4, %g2
	srl	%i4, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 8]
9:
	ld	[%i1 + 12], %g1
	sll	%i4, %g4, %g2
	srl	%i5, %l0, %g5
	addcc	%g3, -4, %g3
	or	%g2, %g5, %g2
	add	%i1, 16, %i1
	st	%g2, [%i0 + 12]
	add	%i0, 16, %i0
	bne,a	5b
	ld	[%i1], %i3
10:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	srl	%l0, 3, %g3
	or	%g2, %g5, %g2
	sub	%i1, %g3, %i1
	andcc	%i2, 2, %g0
	st	%g2, [%i0]
	be	1f
	andcc	%i2, 1, %g0

	ldub	[%i1], %g2
	add	%i1, 2, %i1
	stb	%g2, [%i0 + 4]
	add	%i0, 2, %i0
	ldub	[%i1 - 1], %g2
	stb	%g2, [%i0 + 3]
1:
	be	1f
	nop
	ldub	[%i1], %g2
	stb	%g2, [%i0 + 4]
1:
	ret
	restore	%g7, %g0, %o0

88:	/* short_end */

	and	%o2, 0xe, %o3
20:
	sethi	%hi(89f), %o5
	sll	%o3, 3, %o4
	add	%o0, %o3, %o0
	sub	%o5, %o4, %o5
	add	%o1, %o3, %o1
	jmpl	%o5 + %lo(89f), %g0
	andcc	%o2, 1, %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

	be	1f
	nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	mov	%g7, %o0

90:	/* short_aligned_end */
	bne	88b
	andcc	%o2, 8, %g0

	be	1f
	andcc	%o2, 4, %g0

	ld	[%o1 + 0x00], %g2
	ld	[%o1 + 0x04], %g3
	add	%o1, 8, %o1
	st	%g2, [%o0 + 0x00]
	st	%g3, [%o0 + 0x04]
	add	%o0, 8, %o0
1:
	b	81b
	mov	%o2, %g1