/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the standard memcpy() function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	r8 (retval):	the destination address (copy of in0); it is set
 *			before any branch is taken, so every exit path,
 *			including the zero-length one, returns it
 *
 * Three code paths are used:
 *	- len >= 16 with src, dst and len all multiples of 8:
 *	  a software-pipelined ld8/st8 loop (falls through below)
 *	- len < 16: byte-at-a-time loop (.memcpy_short)
 *	- everything else: .memcpy_long
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
#include <linux/export.h>
#include <asm/asmmacro.h>

GLOBAL_ENTRY(memcpy)

#	define MEM_LAT	21		/* latency to memory */

#	define dst	r2
#	define src	r3
#	define retval	r8
#	define saved_pfs r9
#	define saved_lc	r10
#	define saved_pr	r11
#	define cnt	r16
#	define src2	r17
#	define t0	r18
#	define t1	r19
#	define t2	r20
#	define t3	r21
#	define t4	r22
#	define src_end	r23

#	define N	(MEM_LAT + 4)
#	define Nrot	((N + 7) & ~7)

	/*
	 * First, check if everything (src, dst, len) is a multiple of eight.  If
	 * so, we handle everything with no taken branches (other than the loop
	 * itself) and a small icache footprint.  Otherwise, we jump off to
	 * the more general copy routine handling arbitrary
	 * sizes/alignment etc.
	 */
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc
	or t0=in0,in1
	;;

	or t0=t0,in2		// t0 = dst | src | len; low 3 bits are tested below
	.save pr, saved_pr
	mov saved_pr=pr

	.body

	cmp.eq p6,p0=in2,r0	// zero length?
	mov retval=in0		// return dst
(p6)	br.ret.spnt.many rp	// zero length, return immediately
	;;

	mov dst=in0		// copy because of rotation
	shr.u cnt=in2,3		// number of 8-byte words to copy
	mov pr.rot=1<<16
	;;

	adds cnt=-1,cnt		// br.ctop is repeat/until
	cmp.gtu p7,p0=16,in2	// copying less than 16 bytes?
	mov ar.ec=N
	;;

	and t0=0x7,t0
	mov ar.lc=cnt
	;;
	cmp.ne p6,p0=t0,r0	// p6 = something is not a multiple of 8

	mov src=in1		// copy because of rotation
(p7)	br.cond.spnt.few .memcpy_short	// tested first: all copies < 16 bytes go here
(p6)	br.cond.spnt.few .memcpy_long
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	.rotr val[N]
	.rotp p[N]
	.align 32
1: { .mib
(p[0])	ld8 val[0]=[src],8
	nop.i 0
	brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
	nop.f 0
	br.ctop.dptk.few 1b
}
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Small (<16 bytes) copying, aligned or not, is done via a simple
	 * byte-at-a-time copy loop.  This performs relatively poorly on Itanium,
	 * but it doesn't get used very often (gcc inlines small copies) and due
	 * to atomicity issues, we want to avoid read-modify-write of entire words.
	 */
	.align 32
.memcpy_short:
	adds cnt=-1,in2		// br.ctop is repeat/until
	mov ar.ec=MEM_LAT
	brp.loop.imp 1f, 2f
	;;
	mov ar.lc=cnt
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	nop.m	0
	;;
	/*
	 * It is faster to put a stop bit in the loop here because it makes
	 * the pipeline shorter (and latency is what matters on short copies).
	 */
	.align 32
1: { .mib
(p[0])	ld1 val[0]=[src],1
	nop.i 0
	brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
	nop.f 0
	br.ctop.dptk.few 1b
} ;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
	 * an overriding concern here, but throughput is.  We first do
	 * sub-word copying until the destination is aligned, then we check
	 * if the source is also aligned.  If so, we do a simple load/store-loop
	 * until there are fewer than 8 bytes left over and then we do the tail,
	 * by storing the last few bytes using sub-word copying.  If the source
	 * is not aligned, we branch off to the non-congruent loop.
	 *
	 *   stage:   op:
	 *         0  ld
	 *	   :
	 * MEM_LAT+3  shrp
	 * MEM_LAT+4  st
	 *
	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
	 * seems to introduce an unavoidable bubble in the pipeline so the overall
	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
	 * of 4 byte/cycle.  Still not bad.
	 */
#	undef N
#	undef Nrot
#	define N	(MEM_LAT + 5)		/* number of stages */
#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */

	/*
	 * log2 of the byte distance between consecutive COPY() expansions at
	 * .memcpy_loops; the dispatch below jumps to
	 * .memcpy_loops + ((src & 7) << LOG_LOOP_SIZE), so each expansion is
	 * assumed to occupy exactly 1 << LOG_LOOP_SIZE bytes.
	 */
#define LOG_LOOP_SIZE	6

.memcpy_long:
	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
	and t0=-8,src		// t0 = src & ~7
	and t2=7,src		// t2 = src & 7
	;;
	ld8 t0=[t0]		// t0 = 1st source word
	adds src2=7,src		// src2 = (src + 7)
	sub t4=r0,dst		// t4 = -dst
	;;
	and src2=-8,src2	// src2 = (src + 7) & ~7
	shl t2=t2,3		// t2 = 8*(src & 7)
	shl t4=t4,3		// t4 = 8*(-dst); bits 3-5 hold (-dst & 7)
	;;
	ld8 t1=[src2]		// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
	sub t3=64,t2		// t3 = 64-8*(src & 7)
	shr.u t0=t0,t2
	;;
	add src_end=src,in2
	shl t1=t1,t3
	mov pr=t4,0x38		// (p5,p4,p3)=(-dst & 7) = # of bytes needed to align dst
	;;
	or t0=t0,t1		// t0 = first 8 source bytes, starting at src
	mov cnt=r0		// cnt = # of bytes stored while aligning dst
	adds src_end=-1,src_end
	;;
(p3)	st1 [dst]=t0,1
(p3)	shr.u t0=t0,8
(p3)	adds cnt=1,cnt
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
(p4)	adds cnt=2,cnt
	;;
(p5)	st4 [dst]=t0,4
(p5)	adds cnt=4,cnt
	and src_end=-8,src_end	// src_end = last word of source buffer
	;;

	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:

1:{	add src=cnt,src			// make src point to remainder of source buffer
	sub cnt=in2,cnt			// cnt = number of bytes left to copy
	mov t4=ip			// t4 = address of this bundle (label 1)
  }	;;
	and src2=-8,src			// align source pointer
	adds t4=.memcpy_loops-1b,t4	// t4 = address of .memcpy_loops
	mov ar.ec=N

	and t0=7,src			// t0 = src & 7
	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
	shl cnt=cnt,3			// move bits 0-2 to 3-5
	;;

	.rotr val[N+1], w[2]
	.rotp p[N]

	cmp.ne p6,p0=t0,r0		// is src aligned, too?
	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = offset of COPY() variant
	adds t2=-1,t2			// br.ctop is repeat/until
	;;
	add t4=t0,t4			// t4 = address of the COPY() variant for this src misalignment
	mov pr=cnt,0x38			// set (p5,p4,p3) to # of bytes to copy from the last word
	mov ar.lc=t2
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
(p6)	ld8 val[1]=[src2],8		// prime the pump...
	mov b6=t4
	br.sptk.few b6
	;;

.memcpy_tail:
	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
	// less than 8) and t0 contains the last few bytes of the src buffer:
(p5)	st4 [dst]=t0,4
(p5)	shr.u t0=t0,32
	mov ar.lc=saved_lc
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
	mov ar.pfs=saved_pfs
	;;
(p3)	st1 [dst]=t0
	mov pr=saved_pr,-1
	br.ret.sptk.many rp

///////////////////////////////////////////////////////
	.align 64

	/*
	 * COPY(shift,index): one word-copy loop per source misalignment.
	 * "shift" is the shrp bit count (8 * (src & 7)); "index" selects which
	 * rotating val[] register is paired with val[MEM_LAT+3] (1 only for
	 * the aligned, shift == 0 case).  After the loop, the word at src_end
	 * is loaded and merged into t0 for .memcpy_tail.
	 */
#define COPY(shift,index)									\
 1: { .mib											\
	(p[0])		ld8 val[0]=[src2],8;							\
	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
			brp.loop.imp 1b, 2f							\
    };												\
 2: { .mfb											\
	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
			nop.f 0;								\
			br.ctop.dptk.few 1b;							\
    };												\
			;;									\
			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
			;;									\
			shrp t0=val[N-1],val[N-index],shift;					\
			br .memcpy_tail
.memcpy_loops:
	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
	COPY(8, 0)
	COPY(16, 0)
	COPY(24, 0)
	COPY(32, 0)
	COPY(40, 0)
	COPY(48, 0)
	COPY(56, 0)

END(memcpy)
EXPORT_SYMBOL(memcpy)