1b3a04ed5SBabu Moger/* 2b3a04ed5SBabu Moger * M7memcpy: Optimized SPARC M7 memcpy 3b3a04ed5SBabu Moger * 4b3a04ed5SBabu Moger * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 5b3a04ed5SBabu Moger */ 6b3a04ed5SBabu Moger 7b3a04ed5SBabu Moger .file "M7memcpy.S" 8b3a04ed5SBabu Moger 9b3a04ed5SBabu Moger/* 10b3a04ed5SBabu Moger * memcpy(s1, s2, len) 11b3a04ed5SBabu Moger * 12b3a04ed5SBabu Moger * Copy s2 to s1, always copy n bytes. 13b3a04ed5SBabu Moger * Note: this C code does not work for overlapped copies. 14b3a04ed5SBabu Moger * 15b3a04ed5SBabu Moger * Fast assembler language version of the following C-program for memcpy 16b3a04ed5SBabu Moger * which represents the `standard' for the C-library. 17b3a04ed5SBabu Moger * 18b3a04ed5SBabu Moger * void * 19b3a04ed5SBabu Moger * memcpy(void *s, const void *s0, size_t n) 20b3a04ed5SBabu Moger * { 21b3a04ed5SBabu Moger * if (n != 0) { 22b3a04ed5SBabu Moger * char *s1 = s; 23b3a04ed5SBabu Moger * const char *s2 = s0; 24b3a04ed5SBabu Moger * do { 25b3a04ed5SBabu Moger * *s1++ = *s2++; 26b3a04ed5SBabu Moger * } while (--n != 0); 27b3a04ed5SBabu Moger * } 28b3a04ed5SBabu Moger * return (s); 29b3a04ed5SBabu Moger * } 30b3a04ed5SBabu Moger * 31b3a04ed5SBabu Moger * 32b3a04ed5SBabu Moger * SPARC T7/M7 Flow : 33b3a04ed5SBabu Moger * 34b3a04ed5SBabu Moger * if (count < SMALL_MAX) { 35b3a04ed5SBabu Moger * if count < SHORTCOPY (SHORTCOPY=3) 36b3a04ed5SBabu Moger * copy bytes; exit with dst addr 37b3a04ed5SBabu Moger * if src & dst aligned on word boundary but not long word boundary, 38b3a04ed5SBabu Moger * copy with ldw/stw; branch to finish_up 39b3a04ed5SBabu Moger * if src & dst aligned on long word boundary 40b3a04ed5SBabu Moger * copy with ldx/stx; branch to finish_up 41b3a04ed5SBabu Moger * if src & dst not aligned and length <= SHORTCHECK (SHORTCHECK=14) 42b3a04ed5SBabu Moger * copy bytes; exit with dst addr 43b3a04ed5SBabu Moger * move enough bytes to get src to word boundary 44b3a04ed5SBabu Moger * if 
dst now on word boundary 45b3a04ed5SBabu Moger * move_words: 46b3a04ed5SBabu Moger * copy words; branch to finish_up 47b3a04ed5SBabu Moger * if dst now on half word boundary 48b3a04ed5SBabu Moger * load words, shift half words, store words; branch to finish_up 49b3a04ed5SBabu Moger * if dst on byte 1 50b3a04ed5SBabu Moger * load words, shift 3 bytes, store words; branch to finish_up 51b3a04ed5SBabu Moger * if dst on byte 3 52b3a04ed5SBabu Moger * load words, shift 1 byte, store words; branch to finish_up 53b3a04ed5SBabu Moger * finish_up: 54b3a04ed5SBabu Moger * copy bytes; exit with dst addr 55b3a04ed5SBabu Moger * } else { More than SMALL_MAX bytes 56b3a04ed5SBabu Moger * move bytes until dst is on long word boundary 57b3a04ed5SBabu Moger * if( src is on long word boundary ) { 58b3a04ed5SBabu Moger * if (count < MED_MAX) { 59b3a04ed5SBabu Moger * finish_long: src/dst aligned on 8 bytes 60b3a04ed5SBabu Moger * copy with ldx/stx in 8-way unrolled loop; 61b3a04ed5SBabu Moger * copy final 0-63 bytes; exit with dst addr 62b3a04ed5SBabu Moger * } else { src/dst aligned; count > MED_MAX 63b3a04ed5SBabu Moger * align dst on 64 byte boundary; for main data movement: 64b3a04ed5SBabu Moger * prefetch src data to L2 cache; let HW prefetch move data to L1 cache 65b3a04ed5SBabu Moger * Use BIS (block initializing store) to avoid copying store cache 66b3a04ed5SBabu Moger * lines from memory. But pre-store first element of each cache line 67b3a04ed5SBabu Moger * ST_CHUNK lines in advance of the rest of that cache line. That 68b3a04ed5SBabu Moger * gives time for replacement cache lines to be written back without 69b3a04ed5SBabu Moger * excess STQ and Miss Buffer filling. Repeat until near the end, 70b3a04ed5SBabu Moger * then finish up storing before going to finish_long. 
71b3a04ed5SBabu Moger * } 72b3a04ed5SBabu Moger * } else { src/dst not aligned on 8 bytes 73b3a04ed5SBabu Moger * if src is word aligned and count < MED_WMAX 74b3a04ed5SBabu Moger * move words in 8-way unrolled loop 75b3a04ed5SBabu Moger * move final 0-31 bytes; exit with dst addr 76b3a04ed5SBabu Moger * if count < MED_UMAX 77b3a04ed5SBabu Moger * use alignaddr/faligndata combined with ldd/std in 8-way 78b3a04ed5SBabu Moger * unrolled loop to move data. 79b3a04ed5SBabu Moger * go to unalign_done 80b3a04ed5SBabu Moger * else 81b3a04ed5SBabu Moger * setup alignaddr for faligndata instructions 82b3a04ed5SBabu Moger * align dst on 64 byte boundary; prefetch src data to L1 cache 83b3a04ed5SBabu Moger * loadx8, falign, block-store, prefetch loop 84b3a04ed5SBabu Moger * (only use block-init-store when src/dst on 8 byte boundaries.) 85b3a04ed5SBabu Moger * unalign_done: 86b3a04ed5SBabu Moger * move remaining bytes for unaligned cases. exit with dst addr. 87b3a04ed5SBabu Moger * } 88b3a04ed5SBabu Moger * 89b3a04ed5SBabu Moger */ 90b3a04ed5SBabu Moger 91b3a04ed5SBabu Moger#include <asm/visasm.h> 92b3a04ed5SBabu Moger#include <asm/asi.h> 93b3a04ed5SBabu Moger 94b3a04ed5SBabu Moger#if !defined(EX_LD) && !defined(EX_ST) 95b3a04ed5SBabu Moger#define NON_USER_COPY 96b3a04ed5SBabu Moger#endif 97b3a04ed5SBabu Moger 98b3a04ed5SBabu Moger#ifndef EX_LD 9934060b8fSBabu Moger#define EX_LD(x,y) x 100b3a04ed5SBabu Moger#endif 101b3a04ed5SBabu Moger#ifndef EX_LD_FP 10234060b8fSBabu Moger#define EX_LD_FP(x,y) x 103b3a04ed5SBabu Moger#endif 104b3a04ed5SBabu Moger 105b3a04ed5SBabu Moger#ifndef EX_ST 10634060b8fSBabu Moger#define EX_ST(x,y) x 107b3a04ed5SBabu Moger#endif 108b3a04ed5SBabu Moger#ifndef EX_ST_FP 10934060b8fSBabu Moger#define EX_ST_FP(x,y) x 110b3a04ed5SBabu Moger#endif 111b3a04ed5SBabu Moger 112b3a04ed5SBabu Moger#ifndef EX_RETVAL 113b3a04ed5SBabu Moger#define EX_RETVAL(x) x 114b3a04ed5SBabu Moger#endif 115b3a04ed5SBabu Moger 116b3a04ed5SBabu Moger#ifndef LOAD 117b3a04ed5SBabu 
Moger#define LOAD(type,addr,dest) type [addr], dest 118b3a04ed5SBabu Moger#endif 119b3a04ed5SBabu Moger 120b3a04ed5SBabu Moger#ifndef STORE 121b3a04ed5SBabu Moger#define STORE(type,src,addr) type src, [addr] 122b3a04ed5SBabu Moger#endif 123b3a04ed5SBabu Moger 124b3a04ed5SBabu Moger/* 125b3a04ed5SBabu Moger * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache 126b3a04ed5SBabu Moger * line as "least recently used" which means if many threads are 127b3a04ed5SBabu Moger * active, it has a high probability of being pushed out of the cache 128b3a04ed5SBabu Moger * between the first initializing store and the final stores. 129b3a04ed5SBabu Moger * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which 130b3a04ed5SBabu Moger * marks the cache line as "most recently used" for all 131b3a04ed5SBabu Moger * but the last cache line 132b3a04ed5SBabu Moger */ 133b3a04ed5SBabu Moger#ifndef STORE_ASI 134b3a04ed5SBabu Moger#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 135b3a04ed5SBabu Moger#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P 136b3a04ed5SBabu Moger#else 137b3a04ed5SBabu Moger#define STORE_ASI 0x80 /* ASI_P */ 138b3a04ed5SBabu Moger#endif 139b3a04ed5SBabu Moger#endif 140b3a04ed5SBabu Moger 141b3a04ed5SBabu Moger#ifndef STORE_MRU_ASI 142b3a04ed5SBabu Moger#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 143b3a04ed5SBabu Moger#define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_P 144b3a04ed5SBabu Moger#else 145b3a04ed5SBabu Moger#define STORE_MRU_ASI 0x80 /* ASI_P */ 146b3a04ed5SBabu Moger#endif 147b3a04ed5SBabu Moger#endif 148b3a04ed5SBabu Moger 149b3a04ed5SBabu Moger#ifndef STORE_INIT 150b3a04ed5SBabu Moger#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI 151b3a04ed5SBabu Moger#endif 152b3a04ed5SBabu Moger 153b3a04ed5SBabu Moger#ifndef STORE_INIT_MRU 154b3a04ed5SBabu Moger#define STORE_INIT_MRU(src,addr) stxa src, [addr] STORE_MRU_ASI 155b3a04ed5SBabu Moger#endif 156b3a04ed5SBabu Moger 157b3a04ed5SBabu Moger#ifndef FUNC_NAME 158b3a04ed5SBabu Moger#define FUNC_NAME M7memcpy 
159b3a04ed5SBabu Moger#endif 160b3a04ed5SBabu Moger 161b3a04ed5SBabu Moger#ifndef PREAMBLE 162b3a04ed5SBabu Moger#define PREAMBLE 163b3a04ed5SBabu Moger#endif 164b3a04ed5SBabu Moger 165b3a04ed5SBabu Moger#define BLOCK_SIZE 64 166b3a04ed5SBabu Moger#define SHORTCOPY 3 167b3a04ed5SBabu Moger#define SHORTCHECK 14 168b3a04ed5SBabu Moger#define SHORT_LONG 64 /* max copy for short longword-aligned case */ 169b3a04ed5SBabu Moger /* must be at least 64 */ 170b3a04ed5SBabu Moger#define SMALL_MAX 128 171b3a04ed5SBabu Moger#define MED_UMAX 1024 /* max copy for medium un-aligned case */ 172b3a04ed5SBabu Moger#define MED_WMAX 1024 /* max copy for medium word-aligned case */ 173b3a04ed5SBabu Moger#define MED_MAX 1024 /* max copy for medium longword-aligned case */ 174b3a04ed5SBabu Moger#define ST_CHUNK 24 /* ST_CHUNK - block of values for BIS Store */ 175b3a04ed5SBabu Moger#define ALIGN_PRE 24 /* distance for aligned prefetch loop */ 176b3a04ed5SBabu Moger 177b3a04ed5SBabu Moger .register %g2,#scratch 178b3a04ed5SBabu Moger 179b3a04ed5SBabu Moger .section ".text" 180b3a04ed5SBabu Moger .global FUNC_NAME 181b3a04ed5SBabu Moger .type FUNC_NAME, #function 182b3a04ed5SBabu Moger .align 16 183b3a04ed5SBabu MogerFUNC_NAME: 184b3a04ed5SBabu Moger srlx %o2, 31, %g2 185b3a04ed5SBabu Moger cmp %g2, 0 186b3a04ed5SBabu Moger tne %xcc, 5 187b3a04ed5SBabu Moger PREAMBLE 188b3a04ed5SBabu Moger mov %o0, %g1 ! save %o0 189b3a04ed5SBabu Moger brz,pn %o2, .Lsmallx 190b3a04ed5SBabu Moger cmp %o2, 3 191b3a04ed5SBabu Moger ble,pn %icc, .Ltiny_cp 192b3a04ed5SBabu Moger cmp %o2, 19 193b3a04ed5SBabu Moger ble,pn %icc, .Lsmall_cp 194b3a04ed5SBabu Moger or %o0, %o1, %g2 195b3a04ed5SBabu Moger cmp %o2, SMALL_MAX 196b3a04ed5SBabu Moger bl,pn %icc, .Lmedium_cp 197b3a04ed5SBabu Moger nop 198b3a04ed5SBabu Moger 199b3a04ed5SBabu Moger.Lmedium: 200b3a04ed5SBabu Moger neg %o0, %o5 201b3a04ed5SBabu Moger andcc %o5, 7, %o5 ! 
bytes till DST 8 byte aligned 202b3a04ed5SBabu Moger brz,pt %o5, .Ldst_aligned_on_8 203b3a04ed5SBabu Moger 204b3a04ed5SBabu Moger ! %o5 has the bytes to be written in partial store. 205b3a04ed5SBabu Moger sub %o2, %o5, %o2 206b3a04ed5SBabu Moger sub %o1, %o0, %o1 ! %o1 gets the difference 207b3a04ed5SBabu Moger7: ! dst aligning loop 208b3a04ed5SBabu Moger add %o1, %o0, %o4 20934060b8fSBabu Moger EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5) ! load one byte 210b3a04ed5SBabu Moger subcc %o5, 1, %o5 21134060b8fSBabu Moger EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1) 212b3a04ed5SBabu Moger bgu,pt %xcc, 7b 213b3a04ed5SBabu Moger add %o0, 1, %o0 ! advance dst 214b3a04ed5SBabu Moger add %o1, %o0, %o1 ! restore %o1 215b3a04ed5SBabu Moger.Ldst_aligned_on_8: 216b3a04ed5SBabu Moger andcc %o1, 7, %o5 217b3a04ed5SBabu Moger brnz,pt %o5, .Lsrc_dst_unaligned_on_8 218b3a04ed5SBabu Moger nop 219b3a04ed5SBabu Moger 220b3a04ed5SBabu Moger.Lsrc_dst_aligned_on_8: 221b3a04ed5SBabu Moger ! check if we are copying MED_MAX or more bytes 222b3a04ed5SBabu Moger set MED_MAX, %o3 223b3a04ed5SBabu Moger cmp %o2, %o3 ! limit to store buffer size 224b3a04ed5SBabu Moger bgu,pn %xcc, .Llarge_align8_copy 225b3a04ed5SBabu Moger nop 226b3a04ed5SBabu Moger 227b3a04ed5SBabu Moger/* 228b3a04ed5SBabu Moger * Special case for handling when src and dest are both long word aligned 229b3a04ed5SBabu Moger * and total data to move is less than MED_MAX bytes 230b3a04ed5SBabu Moger */ 231b3a04ed5SBabu Moger.Lmedlong: 232b3a04ed5SBabu Moger subcc %o2, 63, %o2 ! adjust length to allow cc test 233b3a04ed5SBabu Moger ble,pn %xcc, .Lmedl63 ! skip big loop if less than 64 bytes 234b3a04ed5SBabu Moger nop 235b3a04ed5SBabu Moger.Lmedl64: 23634060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63) ! load 237b3a04ed5SBabu Moger subcc %o2, 64, %o2 ! decrement length count 23834060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64) ! 
and store 23934060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64 24034060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56) 24134060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48) 24234060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48) 24334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40) 24434060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40) 24534060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store 24634060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32) 24734060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64 248b3a04ed5SBabu Moger add %o1, 64, %o1 ! increase src ptr by 64 24934060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24) 25034060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16) 251b3a04ed5SBabu Moger add %o0, 64, %o0 ! increase dst ptr by 64 25234060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16) 25334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8) 254b3a04ed5SBabu Moger bgu,pt %xcc, .Lmedl64 ! repeat if at least 64 bytes left 25534060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8) 256b3a04ed5SBabu Moger.Lmedl63: 257b3a04ed5SBabu Moger addcc %o2, 32, %o2 ! adjust remaining count 258b3a04ed5SBabu Moger ble,pt %xcc, .Lmedl31 ! to skip if 31 or fewer bytes left 259b3a04ed5SBabu Moger nop 26034060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31) ! load 261b3a04ed5SBabu Moger sub %o2, 32, %o2 ! decrement length count 26234060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32) ! and store 26334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32 264b3a04ed5SBabu Moger add %o1, 32, %o1 ! 
increase src ptr by 32 26534060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24) 26634060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16) 267b3a04ed5SBabu Moger add %o0, 32, %o0 ! increase dst ptr by 32 26834060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16) 26934060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8) 27034060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8) 271b3a04ed5SBabu Moger.Lmedl31: 272b3a04ed5SBabu Moger addcc %o2, 16, %o2 ! adjust remaining count 273b3a04ed5SBabu Moger ble,pt %xcc, .Lmedl15 ! skip if 15 or fewer bytes left 274b3a04ed5SBabu Moger nop ! 27534060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15) 276b3a04ed5SBabu Moger add %o1, 16, %o1 ! increase src ptr by 16 27734060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15) 278b3a04ed5SBabu Moger sub %o2, 16, %o2 ! decrease count by 16 27934060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8) 280b3a04ed5SBabu Moger add %o0, 16, %o0 ! increase dst ptr by 16 28134060b8fSBabu Moger EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8) 282b3a04ed5SBabu Moger.Lmedl15: 283b3a04ed5SBabu Moger addcc %o2, 15, %o2 ! restore count 284b3a04ed5SBabu Moger bz,pt %xcc, .Lsmallx ! exit if finished 285b3a04ed5SBabu Moger cmp %o2, 8 286b3a04ed5SBabu Moger blt,pt %xcc, .Lmedw7 ! skip if 7 or fewer bytes left 287b3a04ed5SBabu Moger tst %o2 28834060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) ! load 8 bytes 289b3a04ed5SBabu Moger add %o1, 8, %o1 ! increase src ptr by 8 290b3a04ed5SBabu Moger add %o0, 8, %o0 ! increase dst ptr by 8 291b3a04ed5SBabu Moger subcc %o2, 8, %o2 ! decrease count by 8 292b3a04ed5SBabu Moger bnz,pn %xcc, .Lmedw7 29334060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) ! and store 8 294b3a04ed5SBabu Moger retl 295b3a04ed5SBabu Moger mov EX_RETVAL(%g1), %o0 ! 
restore %o0 296b3a04ed5SBabu Moger 297b3a04ed5SBabu Moger .align 16 298b3a04ed5SBabu Moger.Lsrc_dst_unaligned_on_8: 299b3a04ed5SBabu Moger ! DST is 8-byte aligned, src is not 300b3a04ed5SBabu Moger2: 301b3a04ed5SBabu Moger andcc %o1, 0x3, %o5 ! test word alignment 302b3a04ed5SBabu Moger bnz,pt %xcc, .Lunalignsetup ! branch to skip if not word aligned 303b3a04ed5SBabu Moger nop 304b3a04ed5SBabu Moger 305b3a04ed5SBabu Moger/* 306b3a04ed5SBabu Moger * Handle all cases where src and dest are aligned on word 307b3a04ed5SBabu Moger * boundaries. Use unrolled loops for better performance. 308b3a04ed5SBabu Moger * This option wins over standard large data move when 309b3a04ed5SBabu Moger * source and destination is in cache for.Lmedium 310b3a04ed5SBabu Moger * to short data moves. 311b3a04ed5SBabu Moger */ 312b3a04ed5SBabu Moger set MED_WMAX, %o3 313b3a04ed5SBabu Moger cmp %o2, %o3 ! limit to store buffer size 314b3a04ed5SBabu Moger bge,pt %xcc, .Lunalignrejoin ! otherwise rejoin main loop 315b3a04ed5SBabu Moger nop 316b3a04ed5SBabu Moger 317b3a04ed5SBabu Moger subcc %o2, 31, %o2 ! adjust length to allow cc test 318b3a04ed5SBabu Moger ! for end of loop 319b3a04ed5SBabu Moger ble,pt %xcc, .Lmedw31 ! skip big loop if less than 16 320b3a04ed5SBabu Moger.Lmedw32: 32134060b8fSBabu Moger EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32 322b3a04ed5SBabu Moger sllx %o4, 32, %o5 32334060b8fSBabu Moger EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31) 324b3a04ed5SBabu Moger or %o4, %o5, %o5 32534060b8fSBabu Moger EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31) 326b3a04ed5SBabu Moger subcc %o2, 32, %o2 ! 
decrement length count 32734060b8fSBabu Moger EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24) 328b3a04ed5SBabu Moger sllx %o4, 32, %o5 32934060b8fSBabu Moger EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24) 330b3a04ed5SBabu Moger or %o4, %o5, %o5 33134060b8fSBabu Moger EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24) 332b3a04ed5SBabu Moger add %o1, 32, %o1 ! increase src ptr by 32 33334060b8fSBabu Moger EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16) 334b3a04ed5SBabu Moger sllx %o4, 32, %o5 33534060b8fSBabu Moger EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16) 336b3a04ed5SBabu Moger or %o4, %o5, %o5 33734060b8fSBabu Moger EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16) 338b3a04ed5SBabu Moger add %o0, 32, %o0 ! increase dst ptr by 32 33934060b8fSBabu Moger EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8) 340b3a04ed5SBabu Moger sllx %o4, 32, %o5 34134060b8fSBabu Moger EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8) 342b3a04ed5SBabu Moger or %o4, %o5, %o5 343b3a04ed5SBabu Moger bgu,pt %xcc, .Lmedw32 ! repeat if at least 32 bytes left 34434060b8fSBabu Moger EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8) 345b3a04ed5SBabu Moger.Lmedw31: 346b3a04ed5SBabu Moger addcc %o2, 31, %o2 ! restore count 347b3a04ed5SBabu Moger 348b3a04ed5SBabu Moger bz,pt %xcc, .Lsmallx ! exit if finished 349b3a04ed5SBabu Moger nop 350b3a04ed5SBabu Moger cmp %o2, 16 351b3a04ed5SBabu Moger blt,pt %xcc, .Lmedw15 352b3a04ed5SBabu Moger nop 35334060b8fSBabu Moger EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes 354b3a04ed5SBabu Moger sllx %o4, 32, %o5 355b3a04ed5SBabu Moger subcc %o2, 16, %o2 ! decrement length count 35634060b8fSBabu Moger EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16) 357b3a04ed5SBabu Moger or %o4, %o5, %o5 35834060b8fSBabu Moger EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16) 359b3a04ed5SBabu Moger add %o1, 16, %o1 ! 
increase src ptr by 16 36034060b8fSBabu Moger EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8) 361b3a04ed5SBabu Moger add %o0, 16, %o0 ! increase dst ptr by 16 362b3a04ed5SBabu Moger sllx %o4, 32, %o5 36334060b8fSBabu Moger EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8) 364b3a04ed5SBabu Moger or %o4, %o5, %o5 36534060b8fSBabu Moger EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8) 366b3a04ed5SBabu Moger.Lmedw15: 367b3a04ed5SBabu Moger bz,pt %xcc, .Lsmallx ! exit if finished 368b3a04ed5SBabu Moger cmp %o2, 8 369b3a04ed5SBabu Moger blt,pn %xcc, .Lmedw7 ! skip if 7 or fewer bytes left 370b3a04ed5SBabu Moger tst %o2 37134060b8fSBabu Moger EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes 372b3a04ed5SBabu Moger subcc %o2, 8, %o2 ! decrease count by 8 37334060b8fSBabu Moger EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes 374b3a04ed5SBabu Moger add %o1, 8, %o1 ! increase src ptr by 8 37534060b8fSBabu Moger EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4) ! load 4 bytes 376b3a04ed5SBabu Moger add %o0, 8, %o0 ! increase dst ptr by 8 37734060b8fSBabu Moger EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes 378b3a04ed5SBabu Moger bz,pt %xcc, .Lsmallx ! exit if finished 379b3a04ed5SBabu Moger.Lmedw7: ! count is ge 1, less than 8 380b3a04ed5SBabu Moger cmp %o2, 4 ! check for 4 bytes left 381b3a04ed5SBabu Moger blt,pn %xcc, .Lsmallleft3 ! skip if 3 or fewer bytes left 382b3a04ed5SBabu Moger nop ! 38334060b8fSBabu Moger EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2) ! load 4 bytes 384b3a04ed5SBabu Moger add %o1, 4, %o1 ! increase src ptr by 4 385b3a04ed5SBabu Moger add %o0, 4, %o0 ! increase dst ptr by 4 386b3a04ed5SBabu Moger subcc %o2, 4, %o2 ! decrease count by 4 387b3a04ed5SBabu Moger bnz .Lsmallleft3 38834060b8fSBabu Moger EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! 
and store 4 bytes 389b3a04ed5SBabu Moger retl 390b3a04ed5SBabu Moger mov EX_RETVAL(%g1), %o0 391b3a04ed5SBabu Moger 392b3a04ed5SBabu Moger .align 16 393b3a04ed5SBabu Moger.Llarge_align8_copy: ! Src and dst share 8 byte alignment 394b3a04ed5SBabu Moger ! align dst to 64 byte boundary 395b3a04ed5SBabu Moger andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 396b3a04ed5SBabu Moger brz,pn %o3, .Laligned_to_64 397b3a04ed5SBabu Moger andcc %o0, 8, %o3 ! odd long words to move? 398b3a04ed5SBabu Moger brz,pt %o3, .Laligned_to_16 399b3a04ed5SBabu Moger nop 40034060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) 401b3a04ed5SBabu Moger sub %o2, 8, %o2 402b3a04ed5SBabu Moger add %o1, 8, %o1 ! increment src ptr 403b3a04ed5SBabu Moger add %o0, 8, %o0 ! increment dst ptr 40434060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) 405b3a04ed5SBabu Moger.Laligned_to_16: 406b3a04ed5SBabu Moger andcc %o0, 16, %o3 ! pair of long words to move? 407b3a04ed5SBabu Moger brz,pt %o3, .Laligned_to_32 408b3a04ed5SBabu Moger nop 40934060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) 410b3a04ed5SBabu Moger sub %o2, 16, %o2 41134060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16) 412b3a04ed5SBabu Moger add %o1, 16, %o1 ! increment src ptr 41334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8) 414b3a04ed5SBabu Moger add %o0, 16, %o0 ! increment dst ptr 41534060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) 416b3a04ed5SBabu Moger.Laligned_to_32: 417b3a04ed5SBabu Moger andcc %o0, 32, %o3 ! four long words to move? 
418b3a04ed5SBabu Moger brz,pt %o3, .Laligned_to_64 419b3a04ed5SBabu Moger nop 42034060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2) 421b3a04ed5SBabu Moger sub %o2, 32, %o2 42234060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32) 42334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24) 42434060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24) 42534060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16) 42634060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16) 427b3a04ed5SBabu Moger add %o1, 32, %o1 ! increment src ptr 42834060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8) 429b3a04ed5SBabu Moger add %o0, 32, %o0 ! increment dst ptr 43034060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8) 431b3a04ed5SBabu Moger.Laligned_to_64: 432b3a04ed5SBabu Moger! 433b3a04ed5SBabu Moger! Using block init store (BIS) instructions to avoid fetching cache 434b3a04ed5SBabu Moger! lines from memory. Use ST_CHUNK stores to first element of each cache 435b3a04ed5SBabu Moger! line (similar to prefetching) to avoid overfilling STQ or miss buffers. 436b3a04ed5SBabu Moger! Gives existing cache lines time to be moved out of L1/L2/L3 cache. 437b3a04ed5SBabu Moger! Initial stores using MRU version of BIS to keep cache line in 438b3a04ed5SBabu Moger! cache until we are ready to store final element of cache line. 439b3a04ed5SBabu Moger! Then store last element using the LRU version of BIS. 440b3a04ed5SBabu Moger! 441b3a04ed5SBabu Moger andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 442b3a04ed5SBabu Moger and %o2, 0x3f, %o2 ! residue bytes in %o2 443b3a04ed5SBabu Moger! 444b3a04ed5SBabu Moger! We use STORE_MRU_ASI for the first seven stores to each cache line 445b3a04ed5SBabu Moger! followed by STORE_ASI (mark as LRU) for the last store. That 446b3a04ed5SBabu Moger! 
mixed approach reduces the probability that the cache line is removed 447b3a04ed5SBabu Moger! before we finish setting it, while minimizing the effects on 448b3a04ed5SBabu Moger! other cached values during a large memcpy 449b3a04ed5SBabu Moger! 450b3a04ed5SBabu Moger! ST_CHUNK batches up initial BIS operations for several cache lines 451b3a04ed5SBabu Moger! to allow multiple requests to not be blocked by overflowing the 452b3a04ed5SBabu Moger! the store miss buffer. Then the matching stores for all those 453b3a04ed5SBabu Moger! BIS operations are executed. 454b3a04ed5SBabu Moger! 455b3a04ed5SBabu Moger 456b3a04ed5SBabu Moger sub %o0, 8, %o0 ! adjust %o0 for ASI alignment 457b3a04ed5SBabu Moger.Lalign_loop: 458b3a04ed5SBabu Moger cmp %o5, ST_CHUNK*64 459b3a04ed5SBabu Moger blu,pt %xcc, .Lalign_loop_fin 460b3a04ed5SBabu Moger mov ST_CHUNK,%o3 461b3a04ed5SBabu Moger.Lalign_loop_start: 462b3a04ed5SBabu Moger prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21 463b3a04ed5SBabu Moger subcc %o3, 1, %o3 46434060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5) 465b3a04ed5SBabu Moger add %o1, 64, %o1 466b3a04ed5SBabu Moger add %o0, 8, %o0 46734060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 468b3a04ed5SBabu Moger bgu %xcc,.Lalign_loop_start 469b3a04ed5SBabu Moger add %o0, 56, %o0 470b3a04ed5SBabu Moger 471b3a04ed5SBabu Moger mov ST_CHUNK,%o3 472b3a04ed5SBabu Moger sllx %o3, 6, %o4 ! ST_CHUNK*64 473b3a04ed5SBabu Moger sub %o1, %o4, %o1 ! reset %o1 474b3a04ed5SBabu Moger sub %o0, %o4, %o0 ! 
reset %o0 475b3a04ed5SBabu Moger 476b3a04ed5SBabu Moger.Lalign_loop_rest: 47734060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5) 478b3a04ed5SBabu Moger add %o0, 16, %o0 47934060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 48034060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5) 481b3a04ed5SBabu Moger add %o0, 8, %o0 48234060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 483b3a04ed5SBabu Moger subcc %o3, 1, %o3 48434060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5) 485b3a04ed5SBabu Moger add %o0, 8, %o0 48634060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 48734060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5) 488b3a04ed5SBabu Moger add %o0, 8, %o0 48934060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 49034060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5) 491b3a04ed5SBabu Moger add %o0, 8, %o0 49234060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 49334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5) 494b3a04ed5SBabu Moger add %o1, 64, %o1 495b3a04ed5SBabu Moger add %o0, 8, %o0 49634060b8fSBabu Moger EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5) 497b3a04ed5SBabu Moger add %o0, 8, %o0 49834060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5) 499b3a04ed5SBabu Moger sub %o5, 64, %o5 500b3a04ed5SBabu Moger bgu %xcc,.Lalign_loop_rest 501b3a04ed5SBabu Moger ! 
mark cache line as LRU 50234060b8fSBabu Moger EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64) 503b3a04ed5SBabu Moger 504b3a04ed5SBabu Moger cmp %o5, ST_CHUNK*64 505b3a04ed5SBabu Moger bgu,pt %xcc, .Lalign_loop_start 506b3a04ed5SBabu Moger mov ST_CHUNK,%o3 507b3a04ed5SBabu Moger 508b3a04ed5SBabu Moger cmp %o5, 0 509b3a04ed5SBabu Moger beq .Lalign_done 510b3a04ed5SBabu Moger nop 511b3a04ed5SBabu Moger.Lalign_loop_fin: 51234060b8fSBabu Moger EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5) 51334060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5) 51434060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5) 51534060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5) 51634060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5) 51734060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5) 518b3a04ed5SBabu Moger subcc %o5, 64, %o5 51934060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64) 52034060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64) 52134060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64) 52234060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64) 52334060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64) 52434060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64) 52534060b8fSBabu Moger EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64) 526b3a04ed5SBabu Moger add %o1, 64, %o1 52734060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64) 528b3a04ed5SBabu Moger add %o0, 64, %o0 52934060b8fSBabu Moger EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64) 530b3a04ed5SBabu Moger bgu %xcc,.Lalign_loop_fin 53134060b8fSBabu Moger EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64) 532b3a04ed5SBabu Moger 533b3a04ed5SBabu Moger.Lalign_done: 534b3a04ed5SBabu Moger add %o0, 8, %o0 ! 
restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	 nop

	.align 16
	!
	! Unaligned copy, VIS path.
	! Entry state: dst (%o0) is on an 8-byte boundary, src (%o1) is not,
	! remaining count (%o2) > SMALL_MAX.  Uses alignaddr/faligndata to
	! realign 8-byte source loads into aligned 8-byte destination stores.
	! %g1 holds the original dst for the memcpy return value; each
	! EX_LD_FP/EX_ST_FP second argument is the exception-fixup handler
	! encoding how many bytes remain uncopied if that access faults.
	!
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for .Lmedium unaligned limit
	bge,pt	%xcc,.Lunalign_large
	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr from src misalignment
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
	!
	! Medium unaligned loop: copy one 64-byte block per iteration.
	! Each faligndata merges two adjacent 8-byte loads through %gsr to
	! produce one aligned 8-byte store; %f0 carries the last doubleword
	! into the next iteration.
	!
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	 nop

	!
	! Large unaligned copy (count >= MED_UMAX): first bring dst up to a
	! 64-byte block boundary, using a copy strategy chosen by the source
	! alignment (byte / half-word / word), then fall into .Lunalignsrc.
	!
.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	 nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	 nop
	! Src is word aligned: move 8 bytes per iteration as two 4-byte
	! word load/store pairs until dst is block aligned (%o3 bytes).
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	 nop

	! Src is half-word aligned: assemble each 8-byte destination
	! doubleword from a 2-byte + 4-byte + 2-byte source read, then
	! store it with a single stx.
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	 add	%o0, 8, %o0
	ba	.Lunalignsrc
	 nop

	! Src is byte aligned: assemble each 8-byte destination doubleword
	! from byte + three half-word + byte reads.  dst is kept as the
	! (dst - src) difference so only src needs advancing in the loop.
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0		! materialize real dst for the store
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0		! back to (dst - src) form
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	 add	%o1, 8, %o1
	add	%o0,%o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	!
	! Large unaligned loop: one 64-byte block per iteration, software
	! pipelined — %f14 carries the previous doubleword into the next
	! iteration (see the fsrc2 at the loop bottom, in the next section).
	!
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14		! carry last doubleword into next pass
	bgu,pt	%xcc, .Lunalign_sloop
	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
	! 8 bytes per iteration: realign one doubleword via faligndata.
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	 add	%o0, 8, %o0

	! Done with VIS: leave FP state and finish the last <16 bytes
	! with the integer byte copier.
.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	 nop

/*
 * This is a special case of nested memcpy. This can happen when kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2		! %g2 = combined alignment bits
#endif
	!
	! Integer (non-VIS) medium copy path.  %g2 holds (dst | src) so a
	! single test checks the alignment of both pointers at once.
	!
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	 nop

	! Both pointers 8-byte aligned: 32 bytes per iteration with ldx/stx.
.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	 add	%o0, 0x20, %o0
	! 8-byte chunks for the 8..24 byte remainder.
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
	! Optional 4-byte word, then 0..3 tail bytes via .Ltiny_cp.
3:	brz,pt	%o2, .Lexit_cp
	 cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	 sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
	! dst is now 8-byte aligned.  If src is too, take the fast aligned
	! path; otherwise shift-and-merge pairs of aligned doublewords
	! (%o3 = left shift, %g2 = complementary right shift in bits).
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	 sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	 sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3		! back from bit shift to byte offset
	add	%o1, %o3, %o1		! restore true (unaligned) src
	brz,pn	%o2, .Lexit_cp
	 nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

	! Copy the final 1-3 bytes, one at a time, exiting as soon as
	! the count hits zero.
.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

	! Small copy: 4-byte words if both pointers are word aligned
	! (%g2 = dst|src as above), else fall through to the byte loop.
.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	 andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	 nop
	ba,a,pt	%xcc, .Ltiny_cp

	! Plain byte-at-a-time loop for unaligned small copies.
.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

	! Residue copier used after the VIS paths: 4 bytes per iteration
	! (as individual byte moves), then at most 3 bytes in .Lsmallleft3.
.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2		! bias count so cc test ends the loop
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	 EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	 EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	 EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
	! All exits return the original dst (saved in %g1) per memcpy's
	! contract.
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	 mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME