/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#include <asm/export.h>
#define GLOBAL_SPARE	g7
#else
#define GLOBAL_SPARE	g5
#define ASI_BLK_P	0xf0
#define FPRS_FEF	0x04
#ifdef MEMCPY_DEBUG
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC	xcc
#endif

#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
	faligndata	%f1, %f2, %f48;			\
	faligndata	%f2, %f3, %f50;			\
	faligndata	%f3, %f4, %f52;			\
	faligndata	%f4, %f5, %f54;			\
	faligndata	%f5, %f6, %f56;			\
	faligndata	%f6, %f7, %f58;			\
	faligndata	%f7, %f8, %f60;			\
	faligndata	%f8, %f9, %f62;

#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
	EX_LD_FP(LOAD_BLK(%src, %fdest));			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest));			\
	add		%src, 0x40, %src;			\
	subcc		%len, 0x40, %len;			\
	be,pn		%xcc, jmptgt;				\
	 add		%dest, 0x40, %dest;			\

#define LOOP_CHUNK1(src, dest, len, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f0,  f48, len, branch_dest)
#define LOOP_CHUNK2(src, dest, len, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
#define LOOP_CHUNK3(src, dest, len, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)

#define DO_SYNC		membar	#Sync;
#define STORE_SYNC(dest, fsrc)			\
	EX_ST_FP(STORE_BLK(%fsrc, %dest));	\
	add		%dest, 0x40, %dest;	\
	DO_SYNC

#define STORE_JUMP(dest, fsrc, target)		\
	EX_ST_FP(STORE_BLK(%fsrc, %dest));	\
	add		%dest, 0x40, %dest;	\
	ba,pt		%xcc, target;		\
	 nop;

#define FINISH_VISCHUNK(dest, f0, f1, left)	\
	subcc		%left, 8, %left;	\
	bl,pn		%xcc, 95f;		\
	 faligndata	%f0, %f1, %f48;		\
	EX_ST_FP(STORE(std, %f48, %dest));	\
	add		%dest, 8, %dest;

#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
	subcc		%left, 8, %left;		\
	bl,pn		%xcc, 95f;			\
	 fsrc2		%f0, %f1;

#define UNEVEN_VISCHUNK(dest, f0, f1, left)		\
	UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
	ba,a,pt		%xcc, 93f;

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl		FUNC_NAME
	.type		FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3
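	/* Copies of at least 5*64 bytes take the VIS block-load/store
	 * path below; anything shorter is handled by the integer copy
	 * loops at 70f and 80f.
	 */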
	cmp		%o2, (5 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
	VISEntry

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub		%o0, %o1, %GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
	EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, %GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4))
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0))
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f0
	EX_ST_FP(STORE(std, %f0, %o0))
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

	/* Destination is 64-byte aligned.  */
3:
	membar		#LoadStore | #StoreStore | #StoreLoad

	subcc		%o2, 0x40, %GLOBAL_SPARE
	add		%o1, %g1, %g1
	andncc		%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
	srl		%g1, 3, %g2
	sub		%o2, %GLOBAL_SPARE, %g3
	andn		%o1, (0x40 - 1), %o1
	and		%g2, 7, %g2
	andncc		%g3, 0x7, %g3
	fsrc2		%f0, %f2
	sub		%g3, 0x8, %g3
	sub		%o2, %GLOBAL_SPARE, %o2

	add		%g1, %GLOBAL_SPARE, %g1
	subcc		%o2, %g3, %o2

	EX_LD_FP(LOAD_BLK(%o1, %f0))
	add		%o1, 0x40, %o1
	add		%g1, %g3, %g1
	EX_LD_FP(LOAD_BLK(%o1, %f16))
	add		%o1, 0x40, %o1
	sub		%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	EX_LD_FP(LOAD_BLK(%o1, %f32))
	add		%o1, 0x40, %o1

	/* There are 8 instances of the unrolled loop,
	 * one for each possible alignment of the
	 * source buffer.  Each loop instance is 452
	 * bytes.
	 */
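	/* Scale the source-alignment index in %g2 (0-7) by 452, the
	 * size of one loop instance, to index the jump below:
	 * (((%g2 * 8) - %g2) * 16 + %g2) * 4 = %g2 * 113 * 4 = %g2 * 452
	 */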
	sll		%g2, 3, %o3
	sub		%o3, %g2, %o3
	sllx		%o3, 4, %o3
	add		%o3, %g2, %o3
	sllx		%o3, 2, %g2
1:	rd		%pc, %o3
	add		%o3, %lo(1f - 1b), %o3
	jmpl		%o3 + %g2, %g0
	 nop

	.align		64
1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f0, %f2, %f48
1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_SYNC(o0, f48)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_JUMP(o0, f48, 40f)
2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_SYNC(o0, f48)
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_JUMP(o0, f48, 48f)
3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_SYNC(o0, f48)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_JUMP(o0, f48, 56f)

1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f2, %f4, %f48
1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_SYNC(o0, f48)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_JUMP(o0, f48, 41f)
2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_SYNC(o0, f48)
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_JUMP(o0, f48, 49f)
3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_SYNC(o0, f48)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_JUMP(o0, f48, 57f)

1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f4, %f6, %f48
1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_SYNC(o0, f48)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_JUMP(o0, f48, 42f)
2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_SYNC(o0, f48)
	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_JUMP(o0, f48, 50f)
3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_SYNC(o0, f48)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_JUMP(o0, f48, 58f)

1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f6, %f8, %f48
1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_SYNC(o0, f48)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_JUMP(o0, f48, 43f)
2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_SYNC(o0, f48)
	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_JUMP(o0, f48, 51f)
3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_SYNC(o0, f48)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_JUMP(o0, f48, 59f)

1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f8, %f10, %f48
1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_SYNC(o0, f48)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_JUMP(o0, f48, 44f)
2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_SYNC(o0, f48)
	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_JUMP(o0, f48, 52f)
3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_SYNC(o0, f48)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_JUMP(o0, f48, 60f)

1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f10, %f12, %f48
1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_SYNC(o0, f48)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_JUMP(o0, f48, 45f)
2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_SYNC(o0, f48)
	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_JUMP(o0, f48, 53f)
3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_SYNC(o0, f48)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_JUMP(o0, f48, 61f)

1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f12, %f14, %f48
1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_SYNC(o0, f48)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_JUMP(o0, f48, 46f)
2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_SYNC(o0, f48)
	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_JUMP(o0, f48, 54f)
3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_SYNC(o0, f48)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_JUMP(o0, f48, 62f)

1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt		%xcc, 1b+4
	 faligndata	%f14, %f16, %f48
1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_SYNC(o0, f48)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_JUMP(o0, f48, 47f)
2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_SYNC(o0, f48)
	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_JUMP(o0, f48, 55f)
3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_SYNC(o0, f48)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_JUMP(o0, f48, 63f)

40:	FINISH_VISCHUNK(o0, f0,  f2,  g3)
41:	FINISH_VISCHUNK(o0, f2,  f4,  g3)
42:	FINISH_VISCHUNK(o0, f4,  f6,  g3)
43:	FINISH_VISCHUNK(o0, f6,  f8,  g3)
44:	FINISH_VISCHUNK(o0, f8,  f10, g3)
45:	FINISH_VISCHUNK(o0, f10, f12, g3)
46:	FINISH_VISCHUNK(o0, f12, f14, g3)
47:	UNEVEN_VISCHUNK(o0, f14, f0,  g3)
48:	FINISH_VISCHUNK(o0, f16, f18, g3)
49:	FINISH_VISCHUNK(o0, f18, f20, g3)
50:	FINISH_VISCHUNK(o0, f20, f22, g3)
51:	FINISH_VISCHUNK(o0, f22, f24, g3)
52:	FINISH_VISCHUNK(o0, f24, f26, g3)
53:	FINISH_VISCHUNK(o0, f26, f28, g3)
54:	FINISH_VISCHUNK(o0, f28, f30, g3)
55:	UNEVEN_VISCHUNK(o0, f30, f0,  g3)
56:	FINISH_VISCHUNK(o0, f32, f34, g3)
57:	FINISH_VISCHUNK(o0, f34, f36, g3)
58:	FINISH_VISCHUNK(o0, f36, f38, g3)
59:	FINISH_VISCHUNK(o0, f38, f40, g3)
60:	FINISH_VISCHUNK(o0, f40, f42, g3)
61:	FINISH_VISCHUNK(o0, f42, f44, g3)
62:	FINISH_VISCHUNK(o0, f44, f46, g3)
63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0,  g3)

93:	EX_LD_FP(LOAD(ldd, %o1, %f2))
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0))
	bl,pn		%xcc, 95f
	 add		%o0, 8, %o0
	EX_LD_FP(LOAD(ldd, %o1, %f0))
	add		%o1, 8, %o1
	subcc		%g3, 8, %g3
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0))
	bge,pt		%xcc, 93b
	 add		%o0, 8, %o0

95:	brz,pt		%o2, 2f
	 mov		%g1, %o1

1:	EX_LD_FP(LOAD(ldub, %o1, %o3))
	add		%o1, 1, %o1
	subcc		%o2, 1, %o2
	EX_ST_FP(STORE(stb, %o3, %o0))
	bne,pt		%xcc, 1b
	 add		%o0, 1, %o0

2:	membar		#StoreLoad | #StoreStore
	VISExit
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		64
70:	/* 16 <= len < (5 * 64) */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

72:	andn		%o2, 0xf, %GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
	subcc		%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5))
	sub		%o2, 0x8, %o2
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5))
	sub		%o2, 0x4, %o2
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1, %o5))
	subcc		%g1, 1, %g1
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
	subcc		%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80:	/* 0 < len < 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:	EX_LD(LOAD(lduw, %o1, %g1))
	subcc		%o2, 4, %o2
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:	EX_LD(LOAD(ldub, %o1, %g1))
	subcc		%o2, 1, %o2
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME
EXPORT_SYMBOL(FUNC_NAME)