xref: /openbmc/linux/arch/ia64/lib/memset.S (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
/* SPDX-License-Identifier: GPL-2.0 */
/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
        in0:    dest
        in1:    value
        in2:    count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128 B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
201da177e4SLinus Torvalds
#include <linux/export.h>
#include <asm/asmmacro.h>
#undef ret

// Arguments, as received in the stacked input registers.
#define dest		in0			// start of the area to fill (never modified)
#define value		in1			// fill value; replicated into all 8 bytes by memset
#define	cnt		in2			// bytes still to be written (counted down)

// Working general registers.
#define tmp		r31			// short-lived temporary
#define save_lc		r30			// copy of ar.lc taken on entry
#define ptr0		r29			// second store pointer inside the 128-byte line loops
#define ptr1		r28			// main running store pointer
#define ptr2		r27			// companion store pointer
#define ptr3		r26			// address of the last byte (short, < 16 byte path)
#define ptr9 		r24			// store-ahead ("prefetch") pointer, one line per step
#define	loopcnt		r23			// value loaded into ar.lc for the counted loops
#define linecnt		r22			// number of whole LINE_SIZE lines to fill
#define bytecnt		r21			// bytes from dest up to the next 16-byte boundary

#define fvalue		f6			// replicated fill value on the floating-point side

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6			// default register for same-cycle branches
#define p_nz		p7			// value != 0
#define p_zr		p8			// value == 0 (selects the stf.spill line loop)
#define p_unalgn	p9			// dest is not 16-byte aligned
#define p_y		p11			// generic yes/no predicate pair ...
#define p_n		p12
#define p_yy		p13			// ... and a second pair, used alternately with the first
#define p_nn		p14

#define MIN1		15			// alignment mask: 16-byte alignment minus one
#define MIN1P1HALF	8			// (MIN1 + 1) / 2
#define LINE_SIZE	128			// bytes handled per iteration of the line loops
#define LSIZE_SH        7			// shift amount
#define PREF_AHEAD	8			// number of lines touched ahead of the fill loop
571da177e4SLinus Torvalds
/*
 * memset - fill `count` bytes starting at `dest` with the low byte of `value`.
 *
 * In:   in0 = dest, in1 = value, in2 = count
 * Out:  ret0 = dest (the pointer as passed in)
 *
 * Writes: r21-r24, r26-r31, f6, p6-p9, p11-p14 and ar.lc.  ar.lc is saved
 * in r30 on entry and restored on every path that changes it.  in1 and in2
 * are overwritten as working registers; in0 is left intact.
 *
 * Paths:
 *   count == 0   return immediately.
 *   count < 16   .move_bytes_unaligned: st1/st2 stores only.
 *   otherwise    fill the head up to the next 16-byte boundary; then, if
 *                enough remains, whole 128-byte lines (.l1ax with stf8 for
 *                value != 0, .l1bx with stf.spill f0 for value == 0); then
 *                32-byte chunks (.l2), up to three 8-byte words
 *                (.store_words) and a 4/2/1-byte tail
 *                (.move_bytes_from_alignment).
 */
GLOBAL_ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0
	lfetch.nt1 [dest]			// prefetch hint for the start of the destination
	.save   ar.lc, save_lc
	mov.i	save_lc = ar.lc
	.body
} { .mmi
	mov	ret0 = dest			// return value
	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest		// aligned address
	and	tmp = MIN1, dest		// prepare to check for correct alignment
	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst		// create 8 identical bytes in word
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0		// is dest off a 16-byte boundary?
} { .mib
	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }

	// Head: fill from dest up to the next 16-byte boundary.  bytecnt holds
	// the distance; its bits 3..0 select st8/st4/st2/st1.  The stores work
	// downwards from the boundary: ptr2 starts at aligned+8 and is stepped
	// down after a store, or up when that size is skipped.  The .unc
	// compares clear both predicates when dest is already aligned, so no
	// head store is issued in that case.
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt			// account for the st8
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = value,-4		// upper 8 bytes of the 16-byte unit
(p_n)	add	ptr2 = 4, ptr2			// no st8: move up instead
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt			// account for the st4
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = value,-2		// next 4 bytes below
(p_nn)	add	ptr2 = 2, ptr2			// no st4: move up instead
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt			// account for the st2
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	setf.sig fvalue=value			// transfer value to FLP side
(p_y)	st2	[ptr2] = value,-1		// next 2 bytes below
(p_n)	add	ptr2 = 1, ptr2			// no st2: move up instead
;; }

	// The compare below sits in the same instruction group as the final
	// "cnt -= 1", so it tests cnt as it was before that decrement.
{ .mmi
(p_yy)	st1	[ptr2] = value 			// last head byte
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt			// account for the st1
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
;; }

{ .mib
	nop.m 0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
;; }

	// L1A (value != 0): ptr9 first stores 8 bytes at the start of up to
	// PREF_AHEAD lines; the main loop then fills the other 120 bytes of each
	// line through ptr2/ptr0 and keeps ptr9 running ahead until it reaches
	// the end of the whole-line region (ptr1).
	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		// fewer than PREF_AHEAD lines: touch only those
	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			// iterations of the store-ahead loop, minus one
;; }
.pref_l1a:
{ .mib
	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp			// one iteration per whole line
;; }
	// Per iteration: ptr2 writes line offsets 8,16,40,48,72,80,104,112 and
	// ptr0 writes 24,32,56,64,88,96,120; offset 0 was written through ptr9.
.l1ax:
{ .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
;; }
{ .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
;; }
{ .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
;; }
{ .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
;; }
{ .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
;; }
{ .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
;; }
{ .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
;; }
{ .mmb
	stf8 [ptr2] = fvalue, 24
(p_scr)	stf8 [ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
;; }

	// L1B (value == 0): same scheme as L1A, but every store is a 16-byte
	// stf.spill of f0, so a line needs the store-ahead plus seven more.
	TEXT_ALIGN(32)
.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		// fewer than PREF_AHEAD lines: touch only those
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			// iterations of the store-ahead loop, minus one
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	nop.i   0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp			// one iteration per whole line
;; }
	// Per iteration: ptr2 writes line offsets 16,48,80,112 and ptr0 writes
	// 32,64,96; offset 0 was written through ptr9.
.l1bx:
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
;; }
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
;; }
{ .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	// fewer than 8 left: go to the tail
;; }

	// Less than a full line left: fill 32-byte chunks with two interleaved
	// stf8 streams (ptr1 and ptr2 = ptr1 + 16).
.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5   		// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt			// compute the remaining cnt
	mov.i   ar.lc = loopcnt
;; }
	TEXT_ALIGN(32)
.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
;; } { .mmb
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
;; }

	// Fewer than 32 bytes left: store up to three 8-byte words.  Each
	// "16 <= cnt" test reads cnt before the decrement in the same group,
	// i.e. it asks whether another full word follows the current one.
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

{ .mmi
	stf8	[ptr1] = fvalue, 8		// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt			// subtract
;; }
{ .mmi
(p_y)	stf8	[ptr1] = fvalue, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt			// subtract
;; }
{ .mmi						// store
(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt			// subtract
;; }

	// Tail: fewer than 8 bytes left at ptr1; bits 2, 1 and 0 of cnt select
	// a st4, st2 and st1 respectively.
.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value,4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = value,2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	mov.i	ar.lc = save_lc			// put back the caller's loop count
	br.ret.sptk.many rp
;; }

	// Short path: 1 <= cnt <= 15, arbitrary alignment.  On entry p_y/p_n
	// still hold "dest is odd / even" from the tbit at the top.  An odd
	// start is fixed with one st1; after that only st2 pairs are used
	// (ptr1 and ptr2 = ptr1 + 2, both advancing by 4), plus a closing st2
	// and/or a st1 to the very last byte through ptr3.
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le  p_yy, p_nn = 4, cnt		// even start: at least 4 bytes?
(p_y)	cmp.le  p_yy, p_nn = 5, cnt		// odd start: at least 4 after the st1?
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt		// prepare last store
	mov.i	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3			// last store
	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
	br.ret.sptk.many rp
}
END(memset)
EXPORT_SYMBOL(memset)