/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))

51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))

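	/* The .Llarge_aligned loop below copies 64 bytes per
	 * iteration using STORE_INIT, i.e. stxa with the block-init
	 * ASI, which lets the cpu allocate each destination cache
	 * line without first fetching its old contents.  That is why
	 * dest was brought up to 64-byte alignment above, and why a
	 * membar is issued once the loop is done: these initializing
	 * stores are more weakly ordered than normal stores.
	 */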
.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	membar		#StoreLoad | #StoreStore

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2))
	subcc		%o4, 0x40, %o4
	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE(std, %f16, %o0 + 0x00))
	EX_ST_FP(STORE(std, %f18, %o0 + 0x08))
	EX_ST_FP(STORE(std, %f20, %o0 + 0x10))
	EX_ST_FP(STORE(std, %f22, %o0 + 0x18))
	EX_ST_FP(STORE(std, %f24, %o0 + 0x20))
	EX_ST_FP(STORE(std, %f26, %o0 + 0x28))
	EX_ST_FP(STORE(std, %f28, %o0 + 0x30))
	EX_ST_FP(STORE(std, %f30, %o0 + 0x38))
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	 or		%o0, %o1, %g2
#endif
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt		%icc, .Lexit
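	/* Medium-sized copy where src and dst do not share 8-byte
	 * alignment: once dest has been brought up to 8-byte
	 * alignment, each destination doubleword is assembled from
	 * two aligned source doublewords, shifting the current word
	 * left by %g1 bits (8 * the source misalignment) and merging
	 * in the next word shifted right by %g2 = 64 - %g1.
	 */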
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Lsmall_unaligned

.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt		%icc, .Lexit
	.size		FUNC_NAME, .-FUNC_NAME