xref: /openbmc/linux/arch/hexagon/lib/memset.S (revision 05cf4fe738242183f1237f1b3a28b4479348c0a1)
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */


/*
 * HEXAGON_OPT_FUNC_BEGIN name
 *
 * Open the definition of a global function: switch to the .text section,
 * align the entry point to 2^4 = 16 bytes, export \name, mark the symbol
 * as a function and emit its label.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/*
 * HEXAGON_OPT_FUNC_FINISH name
 *
 * Close a function body by recording the size of symbol \name as the
 * distance from its label to the current location.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/* FUNCTION: memset (v2 version) */
/*
 * Arguments as used below: r0 = destination, r1 = fill value (only its
 * low byte is used, via vsplatb), r2 = byte count.  r0 is never written,
 * so it still holds the destination at every "jumpr r31".
 *
 * Working registers:
 *   r3:2  64-bit count of bytes still to store (r3 starts at 0)
 *   r4    fill byte replicated into all four bytes; r5:4 for memd
 *   r7:6  64-bit size of the store just issued (r7 stays 0)
 *   r8    running store pointer
 *   r9    8 - (r0 & 7); bits 0..2 select the byte/half/word head stores
 *   r10   trip count of the doubleword loop
 *
 * Instructions inside { } form one packet.  A predicate tested without
 * ".new" is the value left by an earlier packet, which is why most
 * packets test p0/p1 and compute the next p0/p1 at the same time.
 *
 * Conditions are written "if (p0)" / "if (!p0)", matching the v3 code in
 * this file and the syntax given in the Hexagon Programmer's Reference
 * Manual.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)  /* r7 = r0 & 7  */
		p0 = cmp.eq(r2, #0)
		p1 = cmp.gtu(r2, #7)
	}
	{
		r4 = vsplatb(r1)
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if (p0) jumpr r31 /* count == 0, so return  */
	}
	{
		r3 = #0           /* high word of the r3:2 count  */
		r7 = #0           /* high word of the r7:6 step  */
		p0 = tstbit(r9, #0)
		if (p1) jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	/* p0 tested here is bit 0 of r9; the new p0 is bit 1, for 3:  */
	{
		r6 = #1
		p0 = tstbit(r9, #1)
		p1 = cmp.eq(r2, #1)
		if (!p0) jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)  /* remaining -= 1  */
		if (p1) jumpr r31
	}
	.falign
3: /* skip initial byte store */
	/* p0 tested here is bit 1 of r9; the new p0 is bit 2, for 4:  */
	{
		r6 = #2
		p0 = tstbit(r9, #2)
		p1 = cmp.eq(r2, #2)
		if (!p0) jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)  /* remaining -= 2  */
		if (p1) jumpr r31
	}
	.falign
4: /* skip initial half store */
	/*
	 * p0 tested here is bit 2 of r9.  The new p0 (remaining > 7) is what
	 * 5: sees when the word store is skipped.
	 */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)
		p1 = cmp.eq(r2, #4)
		if (!p0) jump 5f /* skip initial word store */
	}
	/*
	 * The compare reads r2 from before the subtract in the same packet,
	 * so "> 11" here means "> 7 once these 4 bytes are stored".
	 */
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* remaining -= 4  */
		p0 = cmp.gtu(r2, #11)
		if (p1) jumpr r31
	}
	.falign
5: /* skip initial word store */
	/*
	 * p0 = at least one doubleword left.
	 * NOTE(review): r3 was initialised to 0, so the r3 compare looks like
	 * a way to leave p1 clear for 7: when the loop is skipped -- confirm.
	 */
	{
		r10 = lsr(r2, #3)       /* number of whole doublewords  */
		p1 = cmp.eq(r3, #1)
		if (!p0) jump 7f /* skip double loop */
	}
	{
		r5 = r4                 /* r5:4 = eight copies of the fill byte  */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	/* p1 ends up set when the count was exactly 8 before the last subtract  */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)  /* remaining -= 8  */
		p1 = cmp.eq(r2, #8)
	}:endloop0
	.falign
7: /* skip double loop */
	/* p1 comes from the last loop iteration, or from the r3 compare at 5:  */
	{
		p0 = tstbit(r2, #2)
		if (p1) jumpr r31
	}
	/* p0 tested here is bit 2 of the remaining count; new p0 is bit 1  */
	{
		r6 = #4
		p0 = tstbit(r2, #1)
		p1 = cmp.eq(r2, #4)
		if (!p0) jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)  /* remaining -= 4  */
		if (p1) jumpr r31
	}
	.falign
8: /* skip final word store */
	/* p0 tested here is bit 1 of the remaining count  */
	{
		p1 = cmp.eq(r2, #2)
		if (!p0) jump 9f /* skip final half store */
	}
	/* r2 is not updated here: p1 (remaining == 2) already decides  */
	/* whether the half store is the last one                       */
	{
		memh(r8++#2) = r4
		if (p1) jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/*  FUNCTION: memset (v3 and higher version)  */
/*
 * Arguments as used below: r0 = destination, r1 = fill value, r2 = byte
 * count.  r0 is never written, so it still holds the destination at every
 * "jumpr r31".
 *
 * Working registers:
 *   r2    bytes still to store
 *   r3    scratch: byte pointer in the short path, (r6 & 31) while
 *         aligning, then the number of 32-byte lines
 *   r5:4  fill byte replicated into all eight bytes
 *   r6    running store pointer
 *   r7    fill byte replicated into all four bytes
 *   r8    trip count of the doubleword loop
 *
 * Flow: counts of 8 or less use a byte loop.  Otherwise byte/half/word
 * stores align r6 to 8; counts above 127 then align r6 to 32 and fill
 * whole 32-byte lines (.L46 when r1 == 0, .L45 otherwise); .L14 onwards
 * stores the remaining doublewords and the final word/half/byte.
 *
 * Instructions inside { } form one packet; "pN.new" tests the predicate
 * computed in the same packet.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1         /* count == 0: just return  */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3         /* more than 8 bytes  */
	}
	/* 1..8 bytes: store one byte per iteration, then return  */
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* head: one byte if the destination address is odd  */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* head: one halfword if bit 1 of the pointer is set  */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	/* the compare reads r2 from before the add in the same packet  */
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)
		if (p0.new) jump:nt .L1
	}
.L10:
	/* head: one word if bit 2 of the pointer is set; r6 is then 8-aligned  */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)
		if (p0.new) jump:nt .L1
	}
.L12:
	/* 127 bytes or fewer left: skip the 32-byte line path  */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/* up to three doubleword stores until r6 is a multiple of 32  */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	/*
	 * r3 = number of whole 32-byte lines left.  The test is on all of r1,
	 * so a value that is non-zero but has a zero low byte also takes the
	 * .L18 path, which stores the replicated low byte.
	 */
	{
		r3 = lsr(r2,#5)
		if (r1!=#0) jump:nt .L18
	}
	/*
	 * loop0 takes its count from r3 as set at .L17.
	 * NOTE(review): the r8 and r3 values written in this packet are not
	 * read on this path before r8 is rewritten at .L14 -- they look dead.
	 */
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
	.falign
.L46:
	/*
	 * Zero fill: each iteration accounts for 32 bytes with dczeroa as the
	 * only write, i.e. it relies on dczeroa zeroing the 32-byte line at r6.
	 */
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* remaining whole doublewords; r8 = r2 / 8  */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* tail: one word if bit 2 of the remaining count is set  */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	/* tail: one halfword if bit 1 of the remaining count is set  */
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
	/* tail: last byte, stored only when exactly one byte is left  */
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * Non-zero fill, one 32-byte line per iteration: dczeroa on the line,
	 * then four doubleword stores rewrite the same 32 bytes with the fill
	 * pattern.
	 * NOTE(review): presumably the dczeroa is there so the line is
	 * allocated without first being read from memory -- confirm against
	 * the Hexagon Programmer's Reference Manual.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45  */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif