/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 *	if (count < SMALL_MAX) {
 *	  if count < SHORTCOPY		(SHORTCOPY=3)
 *	    copy bytes; exit with dst addr
 *	  if src & dst aligned on word boundary but not long word boundary,
 *	    copy with ldw/stw; branch to finish_up
 *	  if src & dst aligned on long word boundary
 *	    copy with ldx/stx; branch to finish_up
 *	  if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *	    copy bytes; exit with dst addr
 *	  move enough bytes to get src to word boundary
 *	  if dst now on word boundary
 *	  move_words:
 *	    copy words; branch to finish_up
 *	  if dst now on half word boundary
 *	    load words, shift half words, store words; branch to finish_up
 *	  if dst on byte 1
 *	    load words, shift 3 bytes, store words; branch to finish_up
 *	  if dst on byte 3
 *	    load words, shift 1 byte, store words; branch to finish_up
 *	  finish_up:
 *	    copy bytes; exit with dst addr
 *	} else {					More than SMALL_MAX bytes
 *	  move bytes until dst is on long word boundary
 *	  if( src is on long word boundary ) {
 *	    if (count < MED_MAX) {
 *	      finish_long:			src/dst aligned on 8 bytes
 *	        copy with ldx/stx in 8-way unrolled loop;
 *	        copy final 0-63 bytes; exit with dst addr
 *	    } else {				src/dst aligned; count > MED_MAX
 *	      align dst on 64 byte boundary; for main data movement:
 *	      prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *	      Use BIS (block initializing store) to avoid copying store cache
 *	      lines from memory. But pre-store first element of each cache line
 *	      ST_CHUNK lines in advance of the rest of that cache line. That
 *	      gives time for replacement cache lines to be written back without
 *	      excess STQ and Miss Buffer filling. Repeat until near the end,
 *	      then finish up storing before going to finish_long.
 *	    }
 *	  } else {				src/dst not aligned on 8 bytes
 *	    if src is word aligned and count < MED_WMAX
 *	      move words in 8-way unrolled loop
 *	      move final 0-31 bytes; exit with dst addr
 *	    if count < MED_UMAX
 *	      use alignaddr/faligndata combined with ldd/std in 8-way
 *	      unrolled loop to move data.
 *	      go to unalign_done
 *	    else
 *	      setup alignaddr for faligndata instructions
 *	      align dst on 64 byte boundary; prefetch src data to L1 cache
 *	      loadx8, falign, block-store, prefetch loop
 *	      (only use block-init-store when src/dst on 8 byte boundaries.)
 *	    unalign_done:
 *	      move remaining bytes for unaligned cases. exit with dst addr.
 *	  }
 *	}
 *
 */
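/*
 * Rough sketch (C-like pseudocode, for orientation only, not part of the
 * generated code) of the size dispatch performed at the FUNC_NAME entry
 * point below.  The labels name the sections of this file that handle
 * each size class:
 *
 *	if (n == 0)			return dst;	-- .Lsmallx
 *	else if (n <= 3)		byte copies	-- .Ltiny_cp
 *	else if (n <= 19)		word copies	-- .Lsmall_cp
 *	else if (n < SMALL_MAX)		unrolled ldx/stx -- .Lmedium_cp
 *	else				large-copy paths -- .Lmedium
 */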
#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define	BLOCK_SIZE	64
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define	ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define	ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %g1		! save %o0
	brz,pn	%o2, .Lsmallx
	cmp	%o2, 3
	ble,pn	%icc, .Ltiny_cp
	cmp	%o2, 19
	ble,pn	%icc, .Lsmall_cp
	or	%o0, %o1, %g2
	cmp	%o2, SMALL_MAX
	bl,pn	%icc, .Lmedium_cp
	nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set	MED_MAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)	! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)	! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for .Lmedium
 * to short data moves.
 */
	set	MED_WMAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if fewer than 32 bytes
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)	! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx		! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)	! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	bz,pt	%xcc, .Lsmallx		! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	retl
	mov	EX_RETVAL(%g1), %o0
	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
!	Using block init store (BIS) instructions to avoid fetching cache
!	lines from memory. Use ST_CHUNK stores to first element of each cache
!	line (similar to prefetching) to avoid overfilling STQ or miss buffers.
!	Gives existing cache lines time to be moved out of L1/L2/L3 cache.
!	Initial stores using MRU version of BIS to keep cache line in
!	cache until we are ready to store final element of cache line.
!	Then store last element using the LRU version of BIS.
!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
!
!	We use STORE_MRU_ASI for the first seven stores to each cache line
!	followed by STORE_ASI (mark as LRU) for the last store. That
!	mixed approach reduces the probability that the cache line is removed
!	before we finish setting it, while minimizing the effects on
!	other cached values during a large memcpy.
!
!	ST_CHUNK batches up initial BIS operations for several cache lines
!	to allow multiple requests to not be blocked by overflowing the
!	store miss buffer. Then the matching stores for all those
!	BIS operations are executed.
!

	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	mov	ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc,.Lalign_loop_start
	add	%o0, 56, %o0

	mov	ST_CHUNK,%o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0
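!
!	The loop above primed the first 8 bytes of ST_CHUNK consecutive
!	cache lines with MRU block-init stores, then stepped %o0/%o1 back
!	by ST_CHUNK*64.  The loop below makes the second pass over those
!	same lines, filling in the remaining 56 bytes of each line and
!	marking the line LRU with its final (plain STORE_INIT) store.
!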

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	mov	ST_CHUNK,%o3

	cmp	%o5, 0
	beq	.Lalign_done
	nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc,.Lalign_loop_fin
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for .Lmedium unaligned limit
	bge,pt	%xcc,.Lunalign_large
	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
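	! alignaddr above latched the source's byte offset in %gsr.align, so
	! each faligndata below extracts one destination-aligned 8-byte value
	! from a pair of adjacent 8-byte-aligned source doublewords.  %o4
	! walks the aligned source; %o1 has already been advanced past the
	! blocks for the residue code.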
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)	! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	add	%o0, 8, %o0
	ba	.Lunalignsrc
	nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
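	! %o0 now holds (dst - src), so advancing %o1 at the bottom of the
	! loop moves both pointers; the store address is rebuilt with a
	! transient add/sub around each stx, and the true dst pointer is
	! restored after the loop.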
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	add	%o1, 8, %o1
	add	%o0,%o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)	! fetch partial word
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp
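/*
 * Medium-length copy where src and dst do not share 8-byte alignment:
 * first byte-copy until dst is 8-byte aligned, then, if src is still
 * misaligned, read aligned 8-byte words from src and shift/merge
 * (sllx/srlx/or) adjacent pairs to form each aligned 8-byte store.
 */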
.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)	! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)	! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)	! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	nop
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME