xref: /openbmc/linux/arch/parisc/lib/lusercopy.S (revision 4da722ca19f30f7db250db808d1ab1703607a932)
1/*
2 *    User Space Access Routines
3 *
4 *    Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
5 *    Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
6 *    Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
7 *    Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
8 *    Copyright (C) 2017 Helge Deller <deller@gmx.de>
9 *    Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
10 *
11 *
12 *    This program is free software; you can redistribute it and/or modify
13 *    it under the terms of the GNU General Public License as published by
14 *    the Free Software Foundation; either version 2, or (at your option)
15 *    any later version.
16 *
17 *    This program is distributed in the hope that it will be useful,
18 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
19 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 *    GNU General Public License for more details.
21 *
22 *    You should have received a copy of the GNU General Public License
23 *    along with this program; if not, write to the Free Software
24 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25 */
26
27/*
28 * These routines still have plenty of room for optimization
29 * (word & doubleword load/store, dual issue, store hints, etc.).
30 */
31
/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */
36
37
38	.text
39
40#include <asm/assembly.h>
41#include <asm/errno.h>
42#include <linux/linkage.h>
43
44	/*
45	 * get_sr gets the appropriate space value into
46	 * sr1 for kernel/user space access, depending
47	 * on the flag stored in the task structure.
48	 */
49
50	.macro  get_sr
51	mfctl       %cr30,%r1
52	ldw         TI_SEGMENT(%r1),%r22
53	mfsp        %sr3,%r1
54	or,<>       %r22,%r0,%r0
55	copy        %r0,%r1
56	mtsp        %r1,%sr1
57	.endm
58
59	/*
60	 * unsigned long lclear_user(void *to, unsigned long n)
61	 *
62	 * Returns 0 for success.
63	 * otherwise, returns number of bytes not transferred.
64	 */
65
66ENTRY_CFI(lclear_user)
67	.proc
68	.callinfo NO_CALLS
69	.entry
70	comib,=,n   0,%r25,$lclu_done
71	get_sr
72$lclu_loop:
73	addib,<>    -1,%r25,$lclu_loop
741:      stbs,ma     %r0,1(%sr1,%r26)
75
76$lclu_done:
77	bv          %r0(%r2)
78	copy        %r25,%r28
79
802:	b           $lclu_done
81	ldo         1(%r25),%r25
82
83	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
84
85	.exit
86ENDPROC_CFI(lclear_user)
87
88
89	.procend
90
91	/*
92	 * long lstrnlen_user(char *s, long n)
93	 *
94	 * Returns 0 if exception before zero byte or reaching N,
95	 *         N+1 if N would be exceeded,
96	 *         else strlen + 1 (i.e. includes zero byte).
97	 */
98
99ENTRY_CFI(lstrnlen_user)
100	.proc
101	.callinfo NO_CALLS
102	.entry
103	comib,=     0,%r25,$lslen_nzero
104	copy	    %r26,%r24
105	get_sr
1061:      ldbs,ma     1(%sr1,%r26),%r1
107$lslen_loop:
108	comib,=,n   0,%r1,$lslen_done
109	addib,<>    -1,%r25,$lslen_loop
1102:      ldbs,ma     1(%sr1,%r26),%r1
111$lslen_done:
112	bv          %r0(%r2)
113	sub	    %r26,%r24,%r28
114	.exit
115
116$lslen_nzero:
117	b           $lslen_done
118	ldo         1(%r26),%r26 /* special case for N == 0 */
119
1203:      b	    $lslen_done
121	copy        %r24,%r26    /* reset r26 so 0 is returned on fault */
122
123	ASM_EXCEPTIONTABLE_ENTRY(1b,3b)
124	ASM_EXCEPTIONTABLE_ENTRY(2b,3b)
125
126ENDPROC_CFI(lstrnlen_user)
127
128	.procend
129
130
131
/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains space of source region
 * - sr2 already contains space of destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * Register usage (see the alias definitions below):
 * - dst, src and len (arg0-arg2) are updated as the copy proceeds.
 * - end (arg3) is set to dstp + len on entry; the return value is always
 *   computed as end - dst, so only dst has to be exact when a fault occurs.
 * - t0-t4 (r1, r19-r22) are scratch.
 * - ret0, ret1 and r31 hold src, dst and len across the shift-copy path.
 *
 * Every user access carries an exception table entry.  Most of them send a
 * fault straight to .Lcopy_done; the exceptions are described where they
 * occur.
 *
 * This code is based on a C-implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from the glibc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy by loops that copy 32- or 16-bytes
 * at a time using general registers.  Unaligned copies are handled either by
 * aligning the destination and then using shift-and-write method, or in a few
 * cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of memcpy
 * (written in C) is actually quite fast already. This routine is able to beat
 * it by 30-40% for aligned copies because of the loop unrolling, but in some
 * cases the glibc version is still slightly faster. This lends more
 * credibility that gcc can generate very good code as long as we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. Assumption is that those were only efficient on old
 *   machines (pre PA8000 processors)
 */

	dst = arg0
	src = arg1
	len = arg2
	end = arg3
	t1  = r19
	t2  = r20
	t3  = r21
	t4  = r22
	srcspc = sr1
	dstspc = sr2

	t0 = r1
	a1 = t1
	a2 = t2
	a3 = t3
	a0 = t4

	save_src = ret0
	save_dst = ret1
	save_len = r31

ENTRY_CFI(pa_memcpy)
	.proc
	.callinfo NO_CALLS
	.entry

	/* Last destination address */
	add	dst,len,end

	/* short copy with less than 16 bytes? */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* same alignment? (compare the low two bits of src ^ dst) */
	xor	src,dst,t0
	extru	t0,31,2,t1
	cmpib,<>,n  0,t1,.Lunaligned_copy

#ifdef CONFIG_64BIT
	/* only do 64-bit copies if we can get aligned. */
	extru	t0,31,3,t1
	cmpib,<>,n  0,t1,.Lalign_loop32

	/* loop until we are 64-bit aligned */
.Lalign_loop64:
	extru	dst,31,3,t1
	cmpib,=,n	0,t1,.Lcopy_loop_16_start
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop64
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

	/*
	 * Doubleword loop: two 16-byte groups, i.e. 32 bytes, per
	 * iteration.  Runs while len > 31.
	 */
.Lcopy_loop_16_start:
	ldi	31,t0
.Lcopy_loop_16:
	cmpb,COND(>>=),n t0,len,.Lword_loop

10:	ldd	0(srcspc,src),t1
11:	ldd	8(srcspc,src),t2
	ldo	16(src),src
12:	std,ma	t1,8(dstspc,dst)
13:	std,ma	t2,8(dstspc,dst)
14:	ldd	0(srcspc,src),t1
15:	ldd	8(srcspc,src),t2
	ldo	16(src),src
16:	std,ma	t1,8(dstspc,dst)
17:	std,ma	t2,8(dstspc,dst)

	/*
	 * A fault on the second load of a pair (11, 15) goes to
	 * .Lcopy16_fault, which still stores the doubleword that was
	 * already loaded into t1.
	 */
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_16
	ldo	-32(len),len

	/* copy the remaining whole words, then finish in the byte loop */
.Lword_loop:
	cmpib,COND(>>=),n 3,len,.Lbyte_loop
20:	ldw,ma	4(srcspc,src),t1
21:	stw,ma	t1,4(dstspc,dst)
	b	.Lword_loop
	ldo	-4(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

#endif /* CONFIG_64BIT */

	/* loop until we are 32-bit aligned */
.Lalign_loop32:
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_loop_8
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop32
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)


	/*
	 * Word loop: four words, i.e. 16 bytes, per iteration.
	 * Runs while len > 15.  src is only advanced once per iteration,
	 * the loads use explicit offsets.
	 */
.Lcopy_loop_8:
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

10:	ldw	0(srcspc,src),t1
11:	ldw	4(srcspc,src),t2
12:	stw,ma	t1,4(dstspc,dst)
13:	stw,ma	t2,4(dstspc,dst)
14:	ldw	8(srcspc,src),t1
15:	ldw	12(srcspc,src),t2
	ldo	16(src),src
16:	stw,ma	t1,4(dstspc,dst)
17:	stw,ma	t2,4(dstspc,dst)

	/*
	 * A fault on the second load of a pair (11, 15) goes to
	 * .Lcopy8_fault, which still stores the word that was already
	 * loaded into t1.
	 */
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_8
	ldo	-16(len),len

	/* copy whatever is left one byte at a time */
.Lbyte_loop:
	cmpclr,COND(<>) len,%r0,%r0	/* len != 0: skip the branch below */
	b,n	.Lcopy_done
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lbyte_loop
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

	/* common exit: return the number of bytes not copied (end - dst) */
.Lcopy_done:
	bv	%r0(%r2)
	sub	end,dst,ret0


	/* src and dst are not aligned the same way. */
	/* need to go the hard way */
.Lunaligned_copy:
	/* align until dst is 32bit-word-aligned */
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_dstaligned
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lunaligned_copy
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_dstaligned:

	/* store src, dst and len in safe place */
	copy	src,save_src
	copy	dst,save_dst
	copy	len,save_len

	/* len now needs give number of words to copy */
	SHRREG	len,2,len

	/*
	 * Copy from a not-aligned src to an aligned dst using shifts.
	 * Handles 4 words per loop.
	 *
	 * The shift amount register is set to 32 - 8 * (src & 3).  src is
	 * then rounded down to a word boundary and whole words are loaded;
	 * each shrpw combines two consecutive source words into one
	 * destination word.
	 *
	 * (number of words) mod 4 selects the entry point into the unrolled
	 * loop (.Lcase0 - .Lcase3).  Each case preloads two words and
	 * adjusts len so that the "len -= 4" at the bottom of the loop
	 * reaches exactly zero.
	 */

	depw,z src,28,2,t0		/* t0 = (src & 3) * 8 */
	subi 32,t0,t0
	mtsar t0
	extru len,31,2,t0		/* t0 = number of words mod 4 */
	cmpib,= 2,t0,.Lcase2
	/* Make src aligned by rounding it down.  */
	depi 0,31,2,src			/* delay slot: executed on both paths */

	cmpiclr,<> 3,t0,%r0
	b,n .Lcase3
	cmpiclr,<> 1,t0,%r0
	b,n .Lcase1
.Lcase0:
	cmpb,COND(=) %r0,len,.Lcda_finish
	nop

1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b,n .Ldo3
.Lcase1:
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	ldo -1(len),len
	cmpb,COND(=),n %r0,len,.Ldo0
	/* unrolled loop: each step loads one word ahead and stores one */
.Ldo4:
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a2, a3, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo3:
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a3, a0, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo2:
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a0, a1, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo1:
1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a1, a2, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
	ldo -4(len),len
	cmpb,COND(<>) %r0,len,.Ldo4
	nop
	/* store the last pending word, then fall through to the fix-up */
.Ldo0:
	shrpw a2, a3, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)

	/*
	 * Reached at the end of the shift-copy and also after a read fault
	 * inside it.  src was rounded down and runs ahead of dst, so it is
	 * rebuilt from the saved values and the distance dst has advanced.
	 * The byte loop then copies the remaining bytes, or, after a read
	 * fault, every byte up to the one that actually faults.
	 */
.Lcda_rdfault:
.Lcda_finish:
	/* calculate new src, dst and len and jump to byte-copy loop */
	sub	dst,save_dst,t0
	add	save_src,t0,src
	b	.Lbyte_loop
	sub	save_len,t0,len

.Lcase3:
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b .Ldo2
	ldo 1(len),len
.Lcase2:
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b .Ldo1
	ldo 2(len),len


	/*
	 * fault exception fixup handlers:
	 *
	 * Used when the second load of a pair faults in the unrolled
	 * aligned loops.  The store sits in the delay slot of the branch
	 * and writes out the value already loaded into t1, so that dst
	 * (and with it the return value) accounts for it.  A fault in that
	 * store itself goes straight to .Lcopy_done.
	 */
#ifdef CONFIG_64BIT
.Lcopy16_fault:
	b	.Lcopy_done
10:	std,ma	t1,8(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
#endif

.Lcopy8_fault:
	b	.Lcopy_done
10:	stw,ma	t1,4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)

	.exit
ENDPROC_CFI(pa_memcpy)
	.procend
446
447	.end
448