/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If the source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8-, 4-, 2-, and 1-byte copies conditional on the length;
 *   else (if the source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of an aligned source and destination and a length that
 *     is a multiple of 4 (or 8).
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
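
/*
 * For orientation, a rough C sketch of the flow described above
 * (illustrative only, not part of the build; the name memcpy_sketch
 * and the byte-wise tail are simplifications -- the assembly does the
 * tail with 8-, 4-, 2- and 1-byte copies selected by the length bits,
 * and aliasing/alignment caveats are waved away here):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// .Ldst1mod2/.Ldst2mod4: align dst with 1- and 2-byte copies
 *		while (len && ((uintptr_t)d & 3)) {
 *			*d++ = *s++;
 *			len--;
 *		}
 *		// .Loop1: 16 bytes per iteration, four 32-bit words at a
 *		// time (shown for the case where src is word-aligned too;
 *		// otherwise the shifting copy at .Lsrcunaligned runs)
 *		while (len >= 16) {
 *			uint32_t *dw = (uint32_t *)d;
 *			const uint32_t *sw = (const uint32_t *)s;
 *
 *			dw[0] = sw[0]; dw[1] = sw[1];
 *			dw[2] = sw[2]; dw[3] = sw[3];
 *			d += 16; s += 16; len -= 16;
 *		}
 *		while (len--)		// tail, byte-wise for brevity
 *			*d++ = *s++;
 *		return dst;
 *	}
 */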

	.text

/*
 * Byte by byte copy
 */
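
/*
 * Note: LOOPNEZ below is the Xtensa zero-overhead loop ("loop if not
 * zero"): it executes the loop body a4 times with no per-iteration
 * branch.  Configurations without the loops option (!XCHAL_HAVE_LOOPS)
 * fall back to an explicit end-pointer compare.  The pad byte after
 * .align 4 places the 3-byte LOOPNEZ at 1 mod 4 so that the loop body
 * (the LBEG target) starts on a 4-byte boundary.
 */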
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(__memcpy)
WEAK(memcpy)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
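	# Fewer than 16 bytes remain.  Bits 3..0 of the length (a4) now
	# select the tail copies below: bit 3 -> 8 bytes, bit 2 -> 4,
	# bit 1 -> 2, bit 0 -> 1.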
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	abi_ret_default
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
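
/*
 * The shifting copy below relies on the Xtensa SSA8/SRC pair (via the
 * __ssa8/__src_b macros from <asm/asmmacro.h>): SSA8 loads the shift
 * amount register from the low two bits of the source address, and
 * each SRC then funnels two consecutive aligned source words into one
 * correctly shifted destination word.  A rough C model of one such
 * step, assuming little-endian byte order (illustrative only, not
 * part of the build):
 *
 *	#include <stdint.h>
 *
 *	// w0, w1: two consecutive aligned words covering the unaligned
 *	// source word; off = src & 3.  Returns the unaligned word.
 *	static uint32_t src_b(uint32_t w0, uint32_t w1, unsigned int off)
 *	{
 *		unsigned int sh = 8 * off;
 *
 *		return sh ? (w0 >> sh) | (w1 << (32 - sh)) : w0;
 *	}
 */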

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	abi_ret_default
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	abi_ret_default
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memcpy)
EXPORT_SYMBOL(__memcpy)
EXPORT_SYMBOL(memcpy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination does not overlap the as-yet-unread part of
 *     the source (dst - src >= len, treated as unsigned), use the
 *     forward copy (memcpy).
 *   Otherwise do the copy backwards, from the highest address down.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
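
/*
 * Illustrative C outline of this dispatch (not part of the build;
 * memmove_sketch is a hypothetical name, and both branches are
 * byte-wise here where the assembly uses the word-copy machinery):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// One unsigned compare covers both cases in which the
 *		// forward (__memcpy) path is safe: dst below src, or
 *		// dst at least len above src -- see .Lmovecommon below.
 *		if ((uintptr_t)dst - (uintptr_t)src >= len) {
 *			while (len--)
 *				*d++ = *s++;
 *		} else {
 *			// dst lands inside [src, src + len): copy
 *			// backwards, from the highest address down
 *			d += len;
 *			s += len;
 *			while (len--)
 *				*--d = *--s;
 *		}
 *		return dst;
 *	}
 */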

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	abi_ret_default

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(__memmove)
WEAK(memmove)

	abi_entry_default
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon
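	# The unsigned compare above folds two cases into one test: if
	# dst is below src, dst - src wraps to a huge unsigned value,
	# and if dst >= src + len the difference is at least len.  In
	# both cases a forward copy is safe, so we share the __memcpy
	# path.  Only when dst lands inside [src, src + len) do we fall
	# through and copy backwards.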

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	abi_ret_default
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	abi_ret_default

/*
 * Destination is aligned, Source is unaligned
 */
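
/*
 * Mirror image of the forward shifting copy in __memcpy: the shift
 * amount again comes from the low bits of the source address, but the
 * loop walks from high addresses to low, carrying the previously
 * loaded (higher-addressed) word in a6 and pairing it with each newly
 * loaded lower word in __src_b.
 */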

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	abi_ret_default
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	abi_ret_default
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	abi_ret_default

ENDPROC(__memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)