xref: /openbmc/linux/arch/mips/lib/memcpy.S (revision 64c70b1c)
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory.  It's also a seriously bad idea on non dma-coherent
 * systems.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
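
/*
 * Informal C-level sketch of the contracts above (illustrative only; the
 * real __copy_user interface is the non-standard one declared via
 * include/asm-mips/uaccess.h):
 *
 *	void *memcpy(void *dst, const void *src, size_t len);
 *		copies exactly len bytes and returns dst
 *
 *	"__copy_user(dst, src, len)"
 *		copies up to len bytes and leaves the number of bytes NOT
 *		copied (0 on complete success) in len/a2
 */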

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
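
/*
 * Worked example of the invariants above (illustrative addresses only):
 * suppose __copy_user is entered with src_entry == 0x1000 and
 * dst_entry == 0x2000.  If a load faults once src has advanced to 0x1010,
 * then by (3) dst == 0x2010, AT still holds the end-of-source address set
 * up by uaccess.h, and the load handler below can compute both the
 * uncopied byte count (AT - fault address) and the first destination byte
 * to clear (dst + fault address - src).
 */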

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
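
/*
 * Example expansion:  EXC(LOAD t0, UNIT(0)(src), l_exc)  becomes
 *
 *	9:	LOAD	t0, UNIT(0)(src)
 *		.section __ex_table,"a"
 *		PTR	9b, l_exc
 *		.previous
 *
 * i.e. every instruction that may fault gets a local label plus an
 * __ex_table entry directing the fault to the named handler.
 */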

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif
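
/*
 * LDFIRST/LDREST (and STFIRST/STREST) hide the endian-dependent pairing
 * of the unaligned load/store instructions.  For an unaligned 32-bit word
 * at address A the classical sequences are:
 *
 *	big endian:	lwl rt, 0(A);  lwr rt, 3(A)
 *	little endian:	lwr rt, 0(A);  lwl rt, 3(A)
 *
 * LDFIRST is the instruction applied at FIRST(unit) (the low address of
 * the unit) and LDREST the one applied at REST(unit) (its high address),
 * so the same source works for either byte order.
 */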

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	 nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	 SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	STORE	t4, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t7, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t0, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t1, UNIT(-1)(dst),	s_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	bne	rem, len, 1b
	 ADD	dst, dst, NBYTES

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because we can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
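	/*
	 * Worked example (32-bit little endian, len == 3): rem = 24 bits
	 * to keep, bits = 8 to discard.  SHIFT_DISCARD (sllv here) moves
	 * the three wanted bytes to the high end of t0, and STREST (swl)
	 * at dst + len - 1 then writes exactly dst[0..2] without ever
	 * reading dst.
	 */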
#define bits t2
	beqz	len, done
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		s_exc)
	jr	ra
	 move	len, zero
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
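	/*
	 * Example (NBYTES == 4, dst & ADDRMASK == 1): LDFIRST/LDREST below
	 * fetch one full unaligned unit from src, STFIRST stores only the
	 * t2 = 4 - 1 = 3 bytes needed to reach the next unit boundary of
	 * dst, and src, dst and len are then advanced/reduced by t2.
	 */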
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST t3, FIRST(0)(dst),	s_exc)
	beq	len, t2, done
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	 ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB     len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
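	/*
	 * len can be at most NBYTES-1 here, so this final lb/sb pair
	 * handles the last possible byte (offset NBYTES-2).
	 */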
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
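	/*
	 * In other words: salvage the bytes that are still readable, one
	 * at a time, then fall through to l_exc below, which zeroes the
	 * rest of the destination and computes the residual count.
	 */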
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in dst (a0)
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	 SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	 SUB	src, src, 1
	jr	ra
	 nop


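/*
 * Store-fault handlers: s_exc_pNu is used when a store faults with N
 * units of the current block still unwritten.  len has already been
 * decremented for the whole block, so N*NBYTES is added back to leave
 * an upper bound on the number of uncopied bytes in len.
 */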
#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	 ADD	len, len, 1
s_exc:
	jr	ra
	 nop

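/*
 * memmove: the two sltu/and below test for overlap, i.e.
 * (src < dst + len) && (dst < src + len).  Non-overlapping buffers are
 * handed to the fast __memcpy above; overlapping ones fall through to
 * __rmemcpy, a bytewise copy that picks a safe direction.
 */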
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)