/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
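/* Fault fixup stubs for the copy_{from,to}_user wrappers.  When the
 * EX_LD/EX_ST macros are overridden to emit exception-table entries,
 * a faulting load or store resumes at one of the U3_retl_* routines
 * below.  Each stub name encodes the expression it computes into %o0:
 * the number of bytes that had not yet been copied when the fault hit.
 * The _fp variants branch through __restore_fp so the %fprs state set
 * up by VISEntryHalf is unwound before returning.
 */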
__restore_fp:
	VISExitHalf
	retl
	 nop
ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	add	%g1, 1, %g1
	add	%g2, %g1, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
ENTRY(U3_retl_o2_plus_g2_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_fp)
ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
	add	%g2, 8, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
ENTRY(U3_retl_o2)
	retl
	 mov	%o2, %o0
ENDPROC(U3_retl_o2)
ENTRY(U3_retl_o2_plus_1)
	retl
	 add	%o2, 1, %o0
ENDPROC(U3_retl_o2_plus_1)
ENTRY(U3_retl_o2_plus_4)
	retl
	 add	%o2, 4, %o0
ENDPROC(U3_retl_o2_plus_4)
ENTRY(U3_retl_o2_plus_8)
	retl
	 add	%o2, 8, %o0
ENDPROC(U3_retl_o2_plus_8)
ENTRY(U3_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	retl
	 add	%o2, %g1, %o0
ENDPROC(U3_retl_o2_plus_g1_plus_1)
ENTRY(U3_retl_o2_fp)
	ba,pt	%xcc, __restore_fp
	 mov	%o2, %o0
ENDPROC(U3_retl_o2_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x80, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x40, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
ENTRY(U3_retl_o2_plus_GS_plus_0x10)
	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
ENTRY(U3_retl_o2_plus_GS_plus_0x08)
	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
ENTRY(U3_retl_o2_and_7_plus_GS)
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS)
ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
#endif

	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0

	/* software trap 5 "Range Check" if len >= 0x80000000 */
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4

	/* if len == 0 */
	cmp		%o2, 0
	be,pn		%XCC, end_return
	 or		%o0, %o1, %o3

	/* if len < 16 */
	cmp		%o2, 16
	blu,a,pn	%XCC, less_than_16
	 or		%o3, %o2, %o3

	/* if len < 192 */
	cmp		%o2, (3 * 64)
	blu,pt		%XCC, less_than_192
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1)
	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0
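	/* Streaming copy of the 64-byte-aligned middle section.  The
	 * code below keeps several cachelines of prefetch in flight
	 * (using the #one_read hint), pulls the source in as aligned
	 * 8-byte FP loads, realigns it into %f16-%f30 with faligndata,
	 * and writes each completed 64 bytes with a single block store
	 * to ASI_BLK_P.
	 */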
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2)
	faligndata	%f6, %f8, %f22

	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2)
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2)

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	.align		64
1:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80)

	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40)
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync
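	/* Block stores through ASI_BLK_P bypass the normal store
	 * ordering rules on UltraSPARC, hence the membar #Sync above
	 * before we fall back to ordinary loads and stores for the
	 * tail of the copy.
	 */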
	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2)

1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8)
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, end_return
	 sub		%o0, %o1, %o3

	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x8, %o1
	sub		%o2, 8, %o2

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x4, %o1
	sub		%o2, 4, %o2

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x2, %o1
	sub		%o2, 2, %o2

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, end_return
	 nop
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
	ba,pt		%xcc, end_return
	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)

	.align		64
	/* 16 <= len < 192 */
less_than_192:
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, end_return
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b
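	/* Here dst is 8-byte aligned but src is not: load aligned
	 * 64-bit words and merge neighboring pairs with shifts.
	 * %g1 holds the bit offset of src within its word, i.e.
	 * (src & 7) * 8 from the sll above, and %o3 holds the
	 * complementary shift, 64 - %g1.
	 */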
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, end_return
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
	/* 0 < len < 16 */
less_than_16:
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

end_return:
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME