1/* U3memcpy.S: UltraSparc-III optimized memcpy. 2 * 3 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com) 4 */ 5 6#ifdef __KERNEL__ 7#include <linux/linkage.h> 8#include <asm/visasm.h> 9#include <asm/asi.h> 10#define GLOBAL_SPARE %g7 11#else 12#define ASI_BLK_P 0xf0 13#define FPRS_FEF 0x04 14#ifdef MEMCPY_DEBUG 15#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \ 16 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; 17#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs 18#else 19#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs 20#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs 21#endif 22#define GLOBAL_SPARE %g5 23#endif 24 25#ifndef EX_LD 26#define EX_LD(x,y) x 27#endif 28#ifndef EX_LD_FP 29#define EX_LD_FP(x,y) x 30#endif 31 32#ifndef EX_ST 33#define EX_ST(x,y) x 34#endif 35#ifndef EX_ST_FP 36#define EX_ST_FP(x,y) x 37#endif 38 39#ifndef LOAD 40#define LOAD(type,addr,dest) type [addr], dest 41#endif 42 43#ifndef STORE 44#define STORE(type,src,addr) type src, [addr] 45#endif 46 47#ifndef STORE_BLK 48#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P 49#endif 50 51#ifndef FUNC_NAME 52#define FUNC_NAME U3memcpy 53#endif 54 55#ifndef PREAMBLE 56#define PREAMBLE 57#endif 58 59#ifndef XCC 60#define XCC xcc 61#endif 62 63 .register %g2,#scratch 64 .register %g3,#scratch 65 66 /* Special/non-trivial issues of this code: 67 * 68 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf 69 * 2) Only low 32 FPU registers are used so that only the 70 * lower half of the FPU register set is dirtied by this 71 * code. This is especially important in the kernel. 72 * 3) This code never prefetches cachelines past the end 73 * of the source buffer. 74 */ 75 76 .text 77#ifndef EX_RETVAL 78#define EX_RETVAL(x) x 79__restore_fp: 80 VISExitHalf 81 retl 82 nop 83ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp) 84 add %g1, 1, %g1 85 add %g2, %g1, %g2 86 ba,pt %xcc, __restore_fp 87 add %o2, %g2, %o0 88ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp) 89ENTRY(U3_retl_o2_plus_g2_fp) 90 ba,pt %xcc, __restore_fp 91 add %o2, %g2, %o0 92ENDPROC(U3_retl_o2_plus_g2_fp) 93ENTRY(U3_retl_o2_plus_g2_plus_8_fp) 94 add %g2, 8, %g2 95 ba,pt %xcc, __restore_fp 96 add %o2, %g2, %o0 97ENDPROC(U3_retl_o2_plus_g2_plus_8_fp) 98ENTRY(U3_retl_o2) 99 retl 100 mov %o2, %o0 101ENDPROC(U3_retl_o2) 102ENTRY(U3_retl_o2_plus_1) 103 retl 104 add %o2, 1, %o0 105ENDPROC(U3_retl_o2_plus_1) 106ENTRY(U3_retl_o2_plus_4) 107 retl 108 add %o2, 4, %o0 109ENDPROC(U3_retl_o2_plus_4) 110ENTRY(U3_retl_o2_plus_8) 111 retl 112 add %o2, 8, %o0 113ENDPROC(U3_retl_o2_plus_8) 114ENTRY(U3_retl_o2_plus_g1_plus_1) 115 add %g1, 1, %g1 116 retl 117 add %o2, %g1, %o0 118ENDPROC(U3_retl_o2_plus_g1_plus_1) 119ENTRY(U3_retl_o2_fp) 120 ba,pt %xcc, __restore_fp 121 mov %o2, %o0 122ENDPROC(U3_retl_o2_fp) 123ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp) 124 sll %o3, 6, %o3 125 add %o3, 0x80, %o3 126 ba,pt %xcc, __restore_fp 127 add %o2, %o3, %o0 128ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp) 129ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp) 130 sll %o3, 6, %o3 131 add %o3, 0x40, %o3 132 ba,pt %xcc, __restore_fp 133 add %o2, %o3, %o0 134ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp) 135ENTRY(U3_retl_o2_plus_GS_plus_0x10) 136 add GLOBAL_SPARE, 0x10, GLOBAL_SPARE 137 retl 138 add %o2, GLOBAL_SPARE, %o0 139ENDPROC(U3_retl_o2_plus_GS_plus_0x10) 140ENTRY(U3_retl_o2_plus_GS_plus_0x08) 141 add GLOBAL_SPARE, 0x08, GLOBAL_SPARE 142 retl 143 add %o2, GLOBAL_SPARE, %o0 144ENDPROC(U3_retl_o2_plus_GS_plus_0x08) 145ENTRY(U3_retl_o2_and_7_plus_GS) 146 and %o2, 7, %o2 147 retl 148 add %o2, GLOBAL_SPARE, %o2 149ENDPROC(U3_retl_o2_and_7_plus_GS) 150ENTRY(U3_retl_o2_and_7_plus_GS_plus_8) 151 add GLOBAL_SPARE, 8, GLOBAL_SPARE 152 and %o2, 7, %o2 153 retl 154 add %o2, GLOBAL_SPARE, %o2 155ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8) 156#endif 157 158 .align 64 159 160 /* The cheetah's flexible spine, oversized liver, enlarged heart, 161 * slender muscular body, and claws make it the swiftest hunter 162 * in Africa and the fastest animal on land. Can reach speeds 163 * of up to 2.4GB per second. 164 */ 165 166 .globl FUNC_NAME 167 .type FUNC_NAME,#function 168FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 169 srlx %o2, 31, %g2 170 cmp %g2, 0 171 tne %xcc, 5 172 PREAMBLE 173 mov %o0, %o4 174 cmp %o2, 0 175 be,pn %XCC, 85f 176 or %o0, %o1, %o3 177 cmp %o2, 16 178 blu,a,pn %XCC, 80f 179 or %o3, %o2, %o3 180 181 cmp %o2, (3 * 64) 182 blu,pt %XCC, 70f 183 andcc %o3, 0x7, %g0 184 185 /* Clobbers o5/g1/g2/g3/g7/icc/xcc. We must preserve 186 * o5 from here until we hit VISExitHalf. 187 */ 188 VISEntryHalf 189 190 /* Is 'dst' already aligned on an 64-byte boundary? */ 191 andcc %o0, 0x3f, %g2 192 be,pt %XCC, 2f 193 194 /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number 195 * of bytes to copy to make 'dst' 64-byte aligned. We pre- 196 * subtract this from 'len'. 197 */ 198 sub %o0, %o1, GLOBAL_SPARE 199 sub %g2, 0x40, %g2 200 sub %g0, %g2, %g2 201 sub %o2, %g2, %o2 202 andcc %g2, 0x7, %g1 203 be,pt %icc, 2f 204 and %g2, 0x38, %g2 205 2061: subcc %g1, 0x1, %g1 207 EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1) 208 EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1) 209 bgu,pt %XCC, 1b 210 add %o1, 0x1, %o1 211 212 add %o1, GLOBAL_SPARE, %o0 213 2142: cmp %g2, 0x0 215 and %o1, 0x7, %g1 216 be,pt %icc, 3f 217 alignaddr %o1, %g0, %o1 218 219 EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2) 2201: EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2) 221 add %o1, 0x8, %o1 222 subcc %g2, 0x8, %g2 223 faligndata %f4, %f6, %f0 224 EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8) 225 be,pn %icc, 3f 226 add %o0, 0x8, %o0 227 228 EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2) 229 add %o1, 0x8, %o1 230 subcc %g2, 0x8, %g2 231 faligndata %f6, %f4, %f2 232 EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8) 233 bne,pt %icc, 1b 234 add %o0, 0x8, %o0 235 2363: LOAD(prefetch, %o1 + 0x000, #one_read) 237 LOAD(prefetch, %o1 + 0x040, #one_read) 238 andn %o2, (0x40 - 1), GLOBAL_SPARE 239 LOAD(prefetch, %o1 + 0x080, #one_read) 240 LOAD(prefetch, %o1 + 0x0c0, #one_read) 241 LOAD(prefetch, %o1 + 0x100, #one_read) 242 EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2) 243 LOAD(prefetch, %o1 + 0x140, #one_read) 244 EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2) 245 LOAD(prefetch, %o1 + 0x180, #one_read) 246 EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2) 247 LOAD(prefetch, %o1 + 0x1c0, #one_read) 248 faligndata %f0, %f2, %f16 249 EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2) 250 faligndata %f2, %f4, %f18 251 EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2) 252 faligndata %f4, %f6, %f20 253 EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2) 254 faligndata %f6, %f8, %f22 255 256 EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2) 257 faligndata %f8, %f10, %f24 258 EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2) 259 faligndata %f10, %f12, %f26 260 EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2) 261 262 subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE 263 add %o1, 0x40, %o1 264 bgu,pt %XCC, 1f 265 srl GLOBAL_SPARE, 6, %o3 266 ba,pt %xcc, 2f 267 nop 268 269 .align 64 2701: 271 EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80) 272 faligndata %f12, %f14, %f28 273 EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80) 274 faligndata %f14, %f0, %f30 275 EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80) 276 EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40) 277 faligndata %f0, %f2, %f16 278 add %o0, 0x40, %o0 279 280 EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40) 281 faligndata %f2, %f4, %f18 282 EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40) 283 faligndata %f4, %f6, %f20 284 EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40) 285 subcc %o3, 0x01, %o3 286 faligndata %f6, %f8, %f22 287 EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80) 288 289 faligndata %f8, %f10, %f24 290 EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80) 291 LOAD(prefetch, %o1 + 0x1c0, #one_read) 292 faligndata %f10, %f12, %f26 293 bg,pt %XCC, 1b 294 add %o1, 0x40, %o1 295 296 /* Finally we copy the last full 64-byte block. */ 2972: 298 EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80) 299 faligndata %f12, %f14, %f28 300 EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80) 301 faligndata %f14, %f0, %f30 302 EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80) 303 EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40) 304 faligndata %f0, %f2, %f16 305 EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40) 306 faligndata %f2, %f4, %f18 307 EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40) 308 faligndata %f4, %f6, %f20 309 EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40) 310 faligndata %f6, %f8, %f22 311 EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40) 312 faligndata %f8, %f10, %f24 313 cmp %g1, 0 314 be,pt %XCC, 1f 315 add %o0, 0x40, %o0 316 EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40) 3171: faligndata %f10, %f12, %f26 318 faligndata %f12, %f14, %f28 319 faligndata %f14, %f0, %f30 320 EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40) 321 add %o0, 0x40, %o0 322 add %o1, 0x40, %o1 323 membar #Sync 324 325 /* Now we copy the (len modulo 64) bytes at the end. 326 * Note how we borrow the %f0 loaded above. 327 * 328 * Also notice how this code is careful not to perform a 329 * load past the end of the src buffer. 330 */ 331 and %o2, 0x3f, %o2 332 andcc %o2, 0x38, %g2 333 be,pn %XCC, 2f 334 subcc %g2, 0x8, %g2 335 be,pn %XCC, 2f 336 cmp %g1, 0 337 338 sub %o2, %g2, %o2 339 be,a,pt %XCC, 1f 340 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2) 341 3421: EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2) 343 add %o1, 0x8, %o1 344 subcc %g2, 0x8, %g2 345 faligndata %f0, %f2, %f8 346 EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8) 347 be,pn %XCC, 2f 348 add %o0, 0x8, %o0 349 EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2) 350 add %o1, 0x8, %o1 351 subcc %g2, 0x8, %g2 352 faligndata %f2, %f0, %f8 353 EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8) 354 bne,pn %XCC, 1b 355 add %o0, 0x8, %o0 356 357 /* If anything is left, we copy it one byte at a time. 358 * Note that %g1 is (src & 0x3) saved above before the 359 * alignaddr was performed. 360 */ 3612: 362 cmp %o2, 0 363 add %o1, %g1, %o1 364 VISExitHalf 365 be,pn %XCC, 85f 366 sub %o0, %o1, %o3 367 368 andcc %g1, 0x7, %g0 369 bne,pn %icc, 90f 370 andcc %o2, 0x8, %g0 371 be,pt %icc, 1f 372 nop 373 EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2) 374 EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2) 375 add %o1, 0x8, %o1 376 sub %o2, 8, %o2 377 3781: andcc %o2, 0x4, %g0 379 be,pt %icc, 1f 380 nop 381 EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2) 382 EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2) 383 add %o1, 0x4, %o1 384 sub %o2, 4, %o2 385 3861: andcc %o2, 0x2, %g0 387 be,pt %icc, 1f 388 nop 389 EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2) 390 EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2) 391 add %o1, 0x2, %o1 392 sub %o2, 2, %o2 393 3941: andcc %o2, 0x1, %g0 395 be,pt %icc, 85f 396 nop 397 EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2) 398 ba,pt %xcc, 85f 399 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2) 400 401 .align 64 40270: /* 16 < len <= 64 */ 403 bne,pn %XCC, 75f 404 sub %o0, %o1, %o3 405 40672: 407 andn %o2, 0xf, GLOBAL_SPARE 408 and %o2, 0xf, %o2 4091: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE 410 EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10) 411 EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10) 412 EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10) 413 add %o1, 0x8, %o1 414 EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08) 415 bgu,pt %XCC, 1b 416 add %o1, 0x8, %o1 41773: andcc %o2, 0x8, %g0 418 be,pt %XCC, 1f 419 nop 420 sub %o2, 0x8, %o2 421 EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8) 422 EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8) 423 add %o1, 0x8, %o1 4241: andcc %o2, 0x4, %g0 425 be,pt %XCC, 1f 426 nop 427 sub %o2, 0x4, %o2 428 EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4) 429 EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4) 430 add %o1, 0x4, %o1 4311: cmp %o2, 0 432 be,pt %XCC, 85f 433 nop 434 ba,pt %xcc, 90f 435 nop 436 43775: 438 andcc %o0, 0x7, %g1 439 sub %g1, 0x8, %g1 440 be,pn %icc, 2f 441 sub %g0, %g1, %g1 442 sub %o2, %g1, %o2 443 4441: subcc %g1, 1, %g1 445 EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1) 446 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1) 447 bgu,pt %icc, 1b 448 add %o1, 1, %o1 449 4502: add %o1, %o3, %o0 451 andcc %o1, 0x7, %g1 452 bne,pt %icc, 8f 453 sll %g1, 3, %g1 454 455 cmp %o2, 16 456 bgeu,pt %icc, 72b 457 nop 458 ba,a,pt %xcc, 73b 459 4608: mov 64, %o3 461 andn %o1, 0x7, %o1 462 EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2) 463 sub %o3, %g1, %o3 464 andn %o2, 0x7, GLOBAL_SPARE 465 sllx %g2, %g1, %g2 4661: EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS) 467 subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE 468 add %o1, 0x8, %o1 469 srlx %g3, %o3, %o5 470 or %o5, %g2, %o5 471 EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8) 472 add %o0, 0x8, %o0 473 bgu,pt %icc, 1b 474 sllx %g3, %g1, %g2 475 476 srl %g1, 3, %g1 477 andcc %o2, 0x7, %o2 478 be,pn %icc, 85f 479 add %o1, %g1, %o1 480 ba,pt %xcc, 90f 481 sub %o0, %o1, %o3 482 483 .align 64 48480: /* 0 < len <= 16 */ 485 andcc %o3, 0x3, %g0 486 bne,pn %XCC, 90f 487 sub %o0, %o1, %o3 488 4891: 490 subcc %o2, 4, %o2 491 EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4) 492 EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4) 493 bgu,pt %XCC, 1b 494 add %o1, 4, %o1 495 49685: retl 497 mov EX_RETVAL(%o4), %o0 498 499 .align 32 50090: 501 subcc %o2, 1, %o2 502 EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1) 503 EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1) 504 bgu,pt %XCC, 90b 505 add %o1, 1, %o1 506 retl 507 mov EX_RETVAL(%o4), %o0 508 509 .size FUNC_NAME, .-FUNC_NAME 510