/*
 * arch/xtensa/lib/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 * (the symbols defined below are memcpy, bcopy and memmove)
 *
 * xref: /openbmc/linux/arch/xtensa/lib/memcopy.S (revision c819e2cf)
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

/*
 * src_b r, w0, w1
 *
 * Endian-neutral wrapper around the SRC instruction, used to assemble one
 * destination word out of two neighbouring source words.  Callers pass
 * w0 = the word loaded from the lower address and w1 = the word loaded
 * from the next higher address; the macro hands them to SRC in the order
 * required by the configured byte order (big-endian when __XTENSA_EB__
 * is defined, little-endian otherwise).  The result is written to r.
 *
 * The shift amount must have been set up beforehand with the ssa8 macro.
 */
	.macro	src_b	r, w0, w1
#ifdef __XTENSA_EB__
	src	\r, \w0, \w1
#else
	src	\r, \w1, \w0
#endif
	.endm

/*
 * ssa8 r
 *
 * Set the shift amount used by a following SRC (see src_b) from the byte
 * offset of the address held in register r.  Expands to the big-endian
 * form of the instruction (ssa8b) when __XTENSA_EB__ is defined and to
 * the little-endian form (ssa8l) otherwise.
 */
	.macro	ssa8	r
#ifdef __XTENSA_EB__
	ssa8b	\r
#else
	ssa8l	\r
#endif
	.endm

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If the destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If the source is aligned,
 *     copy 16 bytes per iteration in a loop, then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if the source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and a length that is
 *     a multiple of 4 (or 8).
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

/* All code in this file is placed in the text section. */
	.text

/*
 * Byte by byte copy
 *
 * Forward copy used for short lengths; branched to from .Ldst1mod2 and
 * .Ldst2mod4.
 *
 * On entry:	a3 = src, a5 = dst, a4 = number of bytes left (may be 0)
 * Uses:	a6 as the data temporary; a7 holds the source end address
 *		when the core has no zero-overhead loops
 * On exit:	returns to the caller of the function that was entered
 *		(retw), with a2 untouched
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone	# skips the loop body when a4 == 0
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 *
 * Copies 1 and/or 2 leading bytes so that dst (a5) becomes word aligned,
 * then rejoins the main algorithm at .Ldstaligned.  Lengths too short to
 * be worth aligning are handed to .Lbytecopy instead.
 *
 * On entry:	a3 = src, a5 = dst, a4 = length
 * Updates:	a3, a5 advanced and a4 reduced by the bytes copied here
 * Uses:	a6, a7 as data temporaries
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

/*
 * memcpy entry point, dispatch on alignment, and the copy path for a
 * word-aligned destination with a word-aligned source.
 *
 * On entry:	a2 = dst, a3 = src, a4 = len
 * Returns:	a2 = dst (a2 is never modified; a5 is the working dst)
 *
 * .Lcommon is also entered from .Lmovecommon (memmove/bcopy) when a
 * forward copy is safe; a2 and a5 both hold dst on that path as well.
 *
 * At .Ldstaligned a5 is word aligned.  a7 is set to the number of 16-byte
 * iterations (a4 >> 4) and a8 to the mask 3; both values are still live
 * when control branches to .Lsrcunaligned.  After the 16-byte loop the
 * remaining 8/4/2/1 byte pieces are selected by testing bits 3..0 of a4.
 */
	.align	4
	.global	memcpy
	.type   memcpy,@function
memcpy:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2	# skip unless bit 3 of the length is set
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw			# length was a multiple of 8: done
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 *
 * Forward copy that reads the source with word loads and builds each
 * destination word from two neighbouring source words with src_b.
 *
 * On entry (from .Ldstaligned):
 *	a3 = src (not word aligned), a5 = dst (word aligned),
 *	a4 = remaining length, a7 = a4 >> 4, a8 = 3
 * Register use inside:
 *	a6      = most recently loaded source word that has not been fully
 *	          consumed yet; carried from one step to the next
 *	a7-a9   = further source words
 *	a10     = source end address for the loop (only without
 *	          zero-overhead loops)
 *	a11     = byte offset of src within its word (only when a3 gets
 *	          aligned, see SIM_CHECKS_ALIGNMENT below)
 * The final 2 and 1 byte pieces are copied with byte loads from the
 * real (unaligned) source address.
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles.
   The default of 1 can be overridden from the command line. */
#ifndef SIM_CHECKS_ALIGNMENT
#define SIM_CHECKS_ALIGNMENT	1
#endif
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16	# becomes the first word of the next step
	src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8		# keep the last loaded word for the next step
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 *
 * Same job as memmove() but with the first two arguments swapped.
 * The entry code rearranges the registers into the layout expected at
 * .Lmovecommon (a3 = src, a5 = dst, a2 = dst) and continues there, so
 * overlapping buffers are handled exactly as in memmove.
 */
	.align	4
	.global	bcopy
	.type   bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3		# a5 = dst (working destination pointer)
	mov	a3, a2		# a3 = src
	mov	a2, a5		# a2 = dst, as the memcpy code tests a2
	j	.Lmovecommon	# go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If (dst - src), taken as an unsigned value, is at least len, a
 *   forward copy cannot overwrite source bytes that are still to be
 *   read, so the memcpy code is used.
 *   Otherwise do the memcpy backwards, starting at the end of both
 *   buffers.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

/*
 * Byte by byte copy
 *
 * Backward copy used for short lengths; branched to from .Lbackdst1mod2
 * and .Lbackdst2mod4.
 *
 * On entry:	a3 = one past the last source byte still to copy,
 *		a5 = one past the last destination byte still to copy,
 *		a4 = number of bytes left (may be 0)
 * Uses:	a6 as the data temporary; a7 holds the source start address
 *		when the core has no zero-overhead loops
 * On exit:	returns to the caller of memmove/bcopy (retw), with a2
 *		untouched
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone	# skips the loop body when a4 == 0
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned
 *
 * Backward counterpart of .Ldst1mod2/.Ldst2mod4: copies 1 and/or 2
 * trailing bytes so that the destination end pointer (a5) becomes word
 * aligned, then rejoins the main algorithm at .Lbackdstaligned.  Lengths
 * too short to be worth aligning are handed to .Lbackbytecopy instead.
 *
 * On entry:	a3 = end of src, a5 = end of dst, a4 = length
 * Updates:	a3, a5 moved down and a4 reduced by the bytes copied here
 * Uses:	a6, a7 as data temporaries
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

/*
 * memmove entry point, overlap test, and the backward copy path for a
 * word-aligned destination end with a word-aligned source end.
 *
 * On entry:	a2 = dst, a3 = src, a4 = len
 * Returns:	a2 = dst (a2 is never modified; a5 is the working dst)
 *
 * .Lmovecommon is also entered from bcopy with a3 = src, a5 = dst and
 * a2 = dst.  If (dst - src) as an unsigned value is >= len, control
 * goes to the forward memcpy code at .Lcommon.  Otherwise a3 and a5 are
 * moved to the end of their buffers and the copy runs from high to low
 * addresses, mirroring the memcpy structure: align the destination,
 * copy 16 bytes per iteration, then the 8/4/2/1 byte pieces selected by
 * bits 3..0 of a4.
 *
 * At .Lbackdstaligned a7 is set to the number of 16-byte iterations
 * (a4 >> 4) and a8 to the mask 3; both values are still live when
 * control branches to .Lbacksrcunaligned.
 */
	.align	4
	.global	memmove
	.type   memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3	# a6 = dst - src (unsigned distance)
	bgeu	a6, a4, .Lcommon	# forward copy is safe: use memcpy code

	add	a5, a5, a4	# a5 = end of dst
	add	a3, a3, a4	# a3 = end of src

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .LbackLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop1done:
	bbci.l	a4, 3, .Lback2	# skip unless bit 3 of the length is set
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw			# length was a multiple of 8: done
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 *
 * Backward copy that reads the source with word loads and builds each
 * destination word from two neighbouring source words with src_b.
 *
 * On entry (from .Lbackdstaligned):
 *	a3 = end of src (not word aligned), a5 = end of dst (word aligned),
 *	a4 = remaining length, a7 = a4 >> 4, a8 = 3
 * Register use inside:
 *	a6      = most recently loaded source word that has not been fully
 *	          consumed yet; carried from one step to the next
 *	a7-a9   = further source words
 *	a10     = source start address for the loop (only without
 *	          zero-overhead loops)
 *	a11     = byte offset of the src end within its word (only when a3
 *	          gets aligned, see SIM_CHECKS_ALIGNMENT below)
 * The final 2 and 1 byte pieces are copied with byte loads from the
 * real (unaligned) source address.
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS with the lint or ferret client, or 0
   to save a few cycles.  The default of 1 can be overridden from the
   command line. */
#ifndef SIM_CHECKS_ALIGNMENT
#define SIM_CHECKS_ALIGNMENT	1
#endif
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .LbackLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .LbackLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0	# becomes the first word of the next step
	src_b	a8, a9, a8
	s32i	a8, a5,  4
	src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.LbackLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	src_b	a6, a7, a6
	s32i	a6, a5,  4
	src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8		# keep the last loaded word for the next step
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */
