/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are employed to get the best performance under various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at a
 * time using general registers.  Unaligned copies are handled either by
 * aligning the destination and then using the shift-and-write method, or in
 * a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to
 * express in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends
 * credibility to the idea that gcc can generate very good code as long as we
 * are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);
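/* exception_data.fault_addr is filled in by the fault handler fixup when
 * one of the load/store insns below takes an access fault; the error paths
 * at the bottom of copy_dstaligned() and pa_memcpy() use it to compute how
 * many bytes were left uncopied.
 */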

#define preserve_branch(label)	do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to  */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)
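/* get_user_space() yields the space id to use for userspace accesses:
 * %sr3 holds the current user space id, except when the kernel has set
 * KERNEL_DS, in which case space 0 (the kernel space) is used, which is
 * also what get_kernel_space() always returns.
 */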

#define MERGE(w0, sh_1, w1, sh_2)  ({					\
	unsigned int _r;						\
	asm volatile (							\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
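/* A portable sketch of what MERGE computes (illustrative only): shrpw
 * concatenates w0:w1 into a 64-bit value and shifts it right by %sar bits,
 * keeping the low word, so with %sar = sh_2 and sh_1 + sh_2 == 32:
 *
 *	_r = (w0 << sh_1) | (w1 >> sh_2);
 *
 * Note the asm only needs sh_2; the sh_1 argument is implied by it.
 * For example, for a source misaligned by one byte (sh_1 = 8, sh_2 = 24):
 *
 *	MERGE(0xAABBCCDD, 8, 0x11223344, 24) == 0xBBCCDD11
 */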
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
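/* For illustration, ldwma(s_space, pws, r1, pmc_load_exc) expands to
 * (roughly, on a 32-bit kernel):
 *
 *	1:	ldw,ma 4(%sr1,pws), r1
 *		.section __ex_table,"aw"
 *		.word 1b
 *		.word pmc_load_exc
 *		.previous
 *
 * i.e. a word load with post-increment whose address is recorded in the
 * exception table alongside the label to branch to if it faults.
 */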

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef  CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif

/* Copy from a not-aligned src to an aligned dst using shifts. Handles 4
 * words per loop.  This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src, unsigned long len, unsigned long o_dst, unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how much to shift each word read at the word-aligned
	   src so that it lines up with the copy destination.  */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;
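	/* e.g. if src is misaligned by one byte, sh_1 = 8 and sh_2 = 24:
	 * each destination word is then built as
	 * (current_src_word << 8) | (next_src_word >> 24) via MERGE above.
	 */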

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}
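
/* A worked example of the fixup arithmetic above (hypothetical numbers):
 * if a 256-byte copy faults once 100 bytes have been transferred, then
 * d->fault_addr == o_src + 100, and the handler returns
 * 256 - (o_src + 100) + o_src == 156 bytes not copied.
 */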


/* Returns 0 for success; otherwise returns the number of bytes not transferred. */
unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have the same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
		o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}

#ifdef __KERNEL__
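/* In the kernel build, pa_memcpy()'s loads go through %sr1 and its stores
 * through %sr2 (see the s_space/d_space defines above), so each wrapper
 * below first programs those space registers with the appropriate source
 * and destination spaces before doing the copy.
 */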
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif