xref: /openbmc/linux/arch/parisc/lib/lusercopy.S (revision b830f94f)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *    User Space Access Routines
 *
 *    Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
 *    Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
 *    Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
 *    Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
 *    Copyright (C) 2017 Helge Deller <deller@gmx.de>
 *    Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
 */
12
/*
 * These routines still have plenty of room for optimization
 * (word & doubleword load/store, dual issue, store hints, etc.).
 */
17
/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */
22
23
24	.text
25
26#include <asm/assembly.h>
27#include <asm/errno.h>
28#include <linux/linkage.h>
29
/*
 * get_sr - load sr1 with the space id to use for the accesses that follow.
 *
 * The segment flag is read from TI_SEGMENT relative to the pointer held
 * in %cr30:
 *   flag != 0  ->  sr1 = sr3 (the current user's space id)
 *   flag == 0  ->  sr1 = 0   (presumably the kernel space - confirm)
 *
 * Clobbers: %r1, %r22.
 */
	.macro  get_sr
	mfctl       %cr30,%r1			/* r1 = base for TI_SEGMENT */
	ldw         TI_SEGMENT(%r1),%r22	/* r22 = segment flag */
	mfsp        %sr3,%r1			/* default: r1 = sr3 */
	or,<>       %r22,%r0,%r0		/* flag != 0: nullify next insn */
	copy        %r0,%r1			/* flag == 0: r1 = 0 instead */
	mtsp        %r1,%sr1
	.endm
44
/*
 * unsigned long lclear_user(void *to, unsigned long n)
 *
 * Stores n zero bytes starting at 'to', one byte at a time, through sr1
 * (selected by get_sr).
 *
 * In:       %r26 = to, %r25 = n, %r2 = return address
 * Out:      %r28 = 0 on success, otherwise the number of bytes that
 *           were not cleared.
 * Clobbers: %r1, %r22 (via get_sr), %r25, %r26.
 */
ENTRY_CFI(lclear_user)
	comib,=,n   0,%r25,$lclu_done		/* n == 0: nothing to do */
	get_sr
$lclu_loop:
	/*
	 * The count is decremented first; the store sits in the delay slot
	 * and therefore also runs on the final, not-taken iteration.
	 */
	addib,<>    -1,%r25,$lclu_loop
1:      stbs,ma     %r0,1(%sr1,%r26)

$lclu_done:
	bv          %r0(%r2)
	copy        %r25,%r28			/* delay slot: return remaining count */

	/*
	 * Fault fixup for the store at 1: the count had already been
	 * decremented for the byte that faulted, so add it back.
	 */
2:	b           $lclu_done
	ldo         1(%r25),%r25

	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
ENDPROC_CFI(lclear_user)
68
69
/*
 * long lstrnlen_user(char *s, long n)
 *
 * Scans the string at 's' through sr1 (selected by get_sr).
 *
 * In:       %r26 = s, %r25 = n, %r2 = return address
 * Out:      %r28 = 0 if an exception occurs before a zero byte is found
 *                    or N is reached,
 *                  N+1 if N would be exceeded,
 *                  else strlen + 1 (i.e. includes the zero byte).
 * Clobbers: %r1, %r22, %r24, %r25, %r26.
 *
 * The result is always computed as (%r26 - %r24): %r24 keeps the start
 * address and every load post-increments %r26.
 */
ENTRY_CFI(lstrnlen_user)
	comib,=     0,%r25,$lslen_nzero
	copy	    %r26,%r24			/* delay slot: remember start of string */
	get_sr
1:      ldbs,ma     1(%sr1,%r26),%r1
$lslen_loop:
	comib,=,n   0,%r1,$lslen_done		/* zero byte found */
	/*
	 * The next load sits in the delay slot, so it also executes when
	 * the count reaches zero; %r26 then ends up N+1 bytes past the
	 * start, which yields the N+1 return value.
	 */
	addib,<>    -1,%r25,$lslen_loop
2:      ldbs,ma     1(%sr1,%r26),%r1
$lslen_done:
	bv          %r0(%r2)
	sub	    %r26,%r24,%r28		/* delay slot: result = bytes consumed */

$lslen_nzero:
	b           $lslen_done
	ldo         1(%r26),%r26 /* special case for N == 0 */

	/* Fault fixup for the loads at 1 and 2. */
3:      b	    $lslen_done
	copy        %r24,%r26    /* reset r26 so 0 is returned on fault */

	ASM_EXCEPTIONTABLE_ENTRY(1b,3b)
	ASM_EXCEPTIONTABLE_ENTRY(2b,3b)

ENDPROC_CFI(lstrnlen_user)
102
103
/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains space of source region
 * - sr2 already contains space of destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C-implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from glibc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy by loops that copy 32- or 16-bytes
 * at a time using general registers.  Unaligned copies are handled either by
 * aligning the destination and then using shift-and-write method, or in a few
 * cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of memcpy
 * (written in C) is actually quite fast already. This routine is able to beat
 * it by 30-40% for aligned copies because of the loop unrolling, but in some
 * cases the glibc version is still slightly faster. This lends more
 * credibility that gcc can generate very good code as long as we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. Assumption is that those were only efficient on old
 *   machines (pre PA8000 processors)
 */
138
/*
 * Register aliases used by pa_memcpy.
 * (arg0..arg3, ret0 and ret1 are not defined in this file; presumably
 * they come from <asm/assembly.h> - confirm.)
 */
	/* arguments, plus the end-of-destination address */
	dst = arg0
	src = arg1
	len = arg2
	end = arg3

	/* scratch registers */
	t1  = r19
	t2  = r20
	t3  = r21
	t4  = r22

	/* space registers for the source and destination regions */
	srcspc = sr1
	dstspc = sr2

	/*
	 * Names used by the shift-and-store (unaligned) path.  a0..a3 are
	 * the same registers as t4, t1, t2, t3; t0 is an extra scratch.
	 */
	t0 = r1
	a1 = t1
	a2 = t2
	a3 = t3
	a0 = t4

	/* src/dst/len snapshot taken before the shift-and-store loop */
	save_src = ret0
	save_dst = ret1
	save_len = r31
159
/*
 * pa_memcpy implementation.
 *
 * In:   dst, src, len (see the register aliases), srcspc/dstspc already
 *       set up by the caller, %r2 = return address.
 * Out:  ret0 = end - dst, i.e. the number of bytes NOT copied.
 *
 * Because the return value is derived from dst, every fault path must
 * leave dst pointing just past the last byte that was really stored.
 * All stores therefore post-increment dst (",ma"), and all fixups end up
 * in .Lcopy_done.
 *
 * Code paths:
 *   - len < 16:                       .Lbyte_loop
 *   - src/dst equally aligned:        align, then .Lcopy_loop_16 (64-bit
 *                                     only) or .Lcopy_loop_8, then tail
 *   - src/dst differently aligned:    .Lunaligned_copy, which aligns dst
 *                                     and merges source words with shrpw
 */
ENTRY_CFI(pa_memcpy)
	/* Last destination address */
	add	dst,len,end

	/* short copy with less than 16 bytes? */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* same alignment? */
	xor	src,dst,t0
	extru	t0,31,2,t1
	cmpib,<>,n  0,t1,.Lunaligned_copy

#ifdef CONFIG_64BIT
	/* only do 64-bit copies if we can get aligned. */
	extru	t0,31,3,t1
	cmpib,<>,n  0,t1,.Lalign_loop32

	/* loop until we are 64-bit aligned */
.Lalign_loop64:
	extru	dst,31,3,t1
	cmpib,=,n	0,t1,.Lcopy_loop_16_start
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop64
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_loop_16_start:
	ldi	31,t0
.Lcopy_loop_16:
	/* fewer than 32 bytes left: finish with words, then bytes */
	cmpb,COND(>>=),n t0,len,.Lword_loop

	/* 32 bytes per iteration, as two load-pair/store-pair groups */
10:	ldd	0(srcspc,src),t1
11:	ldd	8(srcspc,src),t2
	ldo	16(src),src
12:	std,ma	t1,8(dstspc,dst)
13:	std,ma	t2,8(dstspc,dst)
14:	ldd	0(srcspc,src),t1
15:	ldd	8(srcspc,src),t2
	ldo	16(src),src
16:	std,ma	t1,8(dstspc,dst)
17:	std,ma	t2,8(dstspc,dst)

	/*
	 * A fault on the second load of a pair (11, 15) goes to
	 * .Lcopy16_fault, which still stores the first doubleword that
	 * was read successfully.
	 */
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_16
	ldo	-32(len),len

.Lword_loop:
	cmpib,COND(>>=),n 3,len,.Lbyte_loop
20:	ldw,ma	4(srcspc,src),t1
21:	stw,ma	t1,4(dstspc,dst)
	b	.Lword_loop
	ldo	-4(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

#endif /* CONFIG_64BIT */

	/* loop until we are 32-bit aligned */
.Lalign_loop32:
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_loop_8
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop32
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)


.Lcopy_loop_8:
	/* fewer than 16 bytes left: finish byte by byte */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* 16 bytes per iteration; src is only advanced once, after 15 */
10:	ldw	0(srcspc,src),t1
11:	ldw	4(srcspc,src),t2
12:	stw,ma	t1,4(dstspc,dst)
13:	stw,ma	t2,4(dstspc,dst)
14:	ldw	8(srcspc,src),t1
15:	ldw	12(srcspc,src),t2
	ldo	16(src),src
16:	stw,ma	t1,4(dstspc,dst)
17:	stw,ma	t2,4(dstspc,dst)

	/*
	 * A fault on the second load of a pair (11, 15) goes to
	 * .Lcopy8_fault, which still stores the first word that was
	 * read successfully.
	 */
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_8
	ldo	-16(len),len

.Lbyte_loop:
	/* len != 0 nullifies the branch below, so we keep copying */
	cmpclr,COND(<>) len,%r0,%r0
	b,n	.Lcopy_done
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lbyte_loop
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_done:
	bv	%r0(%r2)
	sub	end,dst,ret0		/* delay slot: bytes not copied */


	/* src and dst are not aligned the same way. */
	/* need to go the hard way */
.Lunaligned_copy:
	/* align until dst is 32bit-word-aligned */
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_dstaligned
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lunaligned_copy
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_dstaligned:

	/* store src, dst and len in safe place */
	copy	src,save_src
	copy	dst,save_dst
	copy	len,save_len

	/* len now needs to give the number of words to copy */
	SHRREG	len,2,len

	/*
	 * Copy from a not-aligned src to an aligned dst using shifts.
	 * Handles 4 words per loop.
	 */

	/* shift amount register = 32 - 8 * (src & 3) */
	depw,z src,28,2,t0
	subi 32,t0,t0
	mtsar t0
	/* t0 = word count modulo 4; it selects the entry point below */
	extru len,31,2,t0
	cmpib,= 2,t0,.Lcase2
	/* Make src aligned by rounding it down.  */
	depi 0,31,2,src		/* delay slot: runs on both paths */

	cmpiclr,<> 3,t0,%r0
	b,n .Lcase3
	cmpiclr,<> 1,t0,%r0
	b,n .Lcase1
.Lcase0:
	cmpb,COND(=) %r0,len,.Lcda_finish
	nop

1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b,n .Ldo3
.Lcase1:
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	ldo -1(len),len
	cmpb,COND(=),n %r0,len,.Ldo0
	/*
	 * Main loop: each step loads the next source word, merges the two
	 * previously loaded words with shrpw and stores one aligned word.
	 * Read faults go to .Lcda_rdfault, write faults to .Lcopy_done.
	 */
.Ldo4:
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a2, a3, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo3:
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a3, a0, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo2:
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a0, a1, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo1:
1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a1, a2, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
	ldo -4(len),len
	cmpb,COND(<>) %r0,len,.Ldo4
	nop
.Ldo0:
	/* store the last word, built from the two words still in flight */
	shrpw a2, a3, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)

.Lcda_rdfault:
.Lcda_finish:
	/*
	 * calculate new src, dst and len and jump to byte-copy loop
	 *
	 * src was rounded down and runs ahead of dst in the loop above, so
	 * it is rebuilt from the saved values and the progress made by dst.
	 */
	sub	dst,save_dst,t0
	add	save_src,t0,src
	b	.Lbyte_loop
	sub	save_len,t0,len		/* delay slot */

	/*
	 * .Lcase3/.Lcase2 enter the loop part-way through; len is bumped in
	 * the delay slot to make up for the -4 applied at the bottom of the
	 * first, partial pass.
	 */
.Lcase3:
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b .Ldo2
	ldo 1(len),len
.Lcase2:
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b .Ldo1
	ldo 2(len),len


	/*
	 * fault exception fixup handlers:
	 *
	 * Reached when the second load of a pair faulted.  The first value
	 * (t1) is stored from the delay slot so that dst accounts for it
	 * before .Lcopy_done computes the result.
	 */
#ifdef CONFIG_64BIT
.Lcopy16_fault:
	b	.Lcopy_done
10:	std,ma	t1,8(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
#endif

.Lcopy8_fault:
	b	.Lcopy_done
10:	stw,ma	t1,4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
ENDPROC_CFI(pa_memcpy)
411
412	.end
413