xref: /openbmc/linux/arch/mips/lib/csum_partial.S (revision 6c870213d6f3a25981c10728f46294a3bed1703f)
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License.  See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Quick'n'dirty IP checksum ...
7 *
8 * Copyright (C) 1998, 1999 Ralf Baechle
9 * Copyright (C) 1999 Silicon Graphics, Inc.
10 * Copyright (C) 2007  Maciej W. Rozycki
11 * Copyright (C) 2014 Imagination Technologies Ltd.
12 */
13#include <linux/errno.h>
14#include <asm/asm.h>
15#include <asm/asm-offsets.h>
16#include <asm/regdef.h>
17
18#ifdef CONFIG_64BIT
19/*
20 * This code base is shared with the mips32 tree, which uses the o32 ABI
21 * register names.  On 64-bit builds, redefine t0-t3 and define t4-t7 so
22 * that the names t0..t7 refer to $8..$15, as the code below expects.
23 */
24#undef t0
25#undef t1
26#undef t2
27#undef t3
28#define t0	$8
29#define t1	$9
30#define t2	$10
31#define t3	$11
32#define t4	$12
33#define t5	$13
34#define t6	$14
35#define t7	$15
36
/* Select the doubleword (8-byte unit) variants of the macros below. */
37#define USE_DOUBLE
38#endif
39
/*
 * Width-dependent primitives used by csum_partial.  LOAD and ADD work on
 * one unit of NBYTES bytes (8 with USE_DOUBLE, 4 otherwise); LOAD32 always
 * loads a 32-bit word (lwu with USE_DOUBLE, lw otherwise).
 */
40#ifdef USE_DOUBLE
41
42#define LOAD   ld
43#define LOAD32 lwu
44#define ADD    daddu
45#define NBYTES 8
46
47#else
48
49#define LOAD   lw
50#define LOAD32 lw
51#define ADD    addu
52#define NBYTES 4
53
54#endif /* USE_DOUBLE */
55
/* Byte offset of the unit-th NBYTES-sized unit. */
56#define UNIT(unit)  ((unit)*NBYTES)
57
/*
 * ADDC(sum, reg): sum += reg with the unit-wide ADD, then add the carry
 * back into sum.  After the first add, sltu sets v1 to 1 when sum < reg,
 * i.e. when the add wrapped around.  Clobbers v1.
 */
58#define ADDC(sum,reg)						\
59	ADD	sum, reg;					\
60	sltu	v1, sum, reg;					\
61	ADD	sum, v1;					\
62
/* ADDC32(sum, reg): same sequence as ADDC but always with addu.  Clobbers v1. */
63#define ADDC32(sum,reg)						\
64	addu	sum, reg;					\
65	sltu	v1, sum, reg;					\
66	addu	sum, v1;					\
67
/*
 * CSUM_BIGCHUNK1: load the four consecutive units at offset(src) into
 * _t0.._t3, then accumulate each of them into sum with ADDC (which
 * clobbers v1).  All four loads are issued before the first add.
 */
68#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
69	LOAD	_t0, (offset + UNIT(0))(src);			\
70	LOAD	_t1, (offset + UNIT(1))(src);			\
71	LOAD	_t2, (offset + UNIT(2))(src);			\
72	LOAD	_t3, (offset + UNIT(3))(src);			\
73	ADDC(sum, _t0);						\
74	ADDC(sum, _t1);						\
75	ADDC(sum, _t2);						\
76	ADDC(sum, _t3)
77
/*
 * CSUM_BIGCHUNK: accumulate 0x20 bytes starting at offset(src).  With
 * USE_DOUBLE a single CSUM_BIGCHUNK1 covers them (4 units of 8 bytes);
 * otherwise two are used, the second one at offset + 0x10.
 */
78#ifdef USE_DOUBLE
79#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
80	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
81#else
82#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
83	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
84	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
85#endif
86
/*
 * csum_partial: accumulate a checksum over a1 bytes starting at a0, add the
 * partial checksum passed in a2 and return the result in v0.
 *
 * Registers written: v0 (sum), v1 (carry scratch of ADDC), a0, a1, t0-t4,
 * t7 (non-zero when the buffer starts on an odd address) and t8.
 *
 * The code runs under .set noreorder: the instruction after each branch
 * (indented by one extra space) is its delay slot and executes whether or
 * not the branch is taken.
 */
87/*
88 * a0: source address
89 * a1: length of the area to checksum
90 * a2: partial checksum
91 */
92
93#define src a0
94#define sum v0
95
96	.text
97	.set	noreorder
98	.align	5
99LEAF(csum_partial)
100	move	sum, zero
101	move	t7, zero
102
	/* Short buffers skip the alignment steps; t2 carries the byte count. */
103	sltiu	t8, a1, 0x8
104	bnez	t8, .Lsmall_csumcpy		/* < 8 bytes to copy */
105	 move	t2, a1
106
107	andi	t7, src, 0x1			/* odd buffer? */
108
	/*
	 * Odd start address: consume one byte so that src becomes halfword
	 * aligned.  t7 stays set, and the result is byte-swapped before the
	 * function returns.
	 */
109.Lhword_align:
110	beqz	t7, .Lword_align
111	 andi	t8, src, 0x2
112
113	lbu	t0, (src)
114	LONG_SUBU	a1, a1, 0x1
115#ifdef __MIPSEL__
116	sll	t0, t0, 8
117#endif
118	ADDC(sum, t0)
119	PTR_ADDU	src, src, 0x1
120	andi	t8, src, 0x2
121
	/* Consume a halfword if bit 1 of src is set; afterwards t8 = (a1 < 56). */
122.Lword_align:
123	beqz	t8, .Ldword_align
124	 sltiu	t8, a1, 56
125
126	lhu	t0, (src)
127	LONG_SUBU	a1, a1, 0x2
128	ADDC(sum, t0)
129	sltiu	t8, a1, 56
130	PTR_ADDU	src, src, 0x2
131
	/*
	 * With fewer than 56 bytes left, go straight to the word loop with
	 * t8 = a1 (set in the delay slot).  Otherwise consume a 32-bit word
	 * if bit 2 of src is set.
	 */
132.Ldword_align:
133	bnez	t8, .Ldo_end_words
134	 move	t8, a1
135
136	andi	t8, src, 0x4
137	beqz	t8, .Lqword_align
138	 andi	t8, src, 0x8
139
140	LOAD32	t0, 0x00(src)
141	LONG_SUBU	a1, a1, 0x4
142	ADDC(sum, t0)
143	PTR_ADDU	src, src, 0x4
144	andi	t8, src, 0x8
145
	/* Consume 8 bytes if bit 3 of src is set. */
146.Lqword_align:
147	beqz	t8, .Loword_align
148	 andi	t8, src, 0x10
149
150#ifdef USE_DOUBLE
151	ld	t0, 0x00(src)
152	LONG_SUBU	a1, a1, 0x8
153	ADDC(sum, t0)
154#else
155	lw	t0, 0x00(src)
156	lw	t1, 0x04(src)
157	LONG_SUBU	a1, a1, 0x8
158	ADDC(sum, t0)
159	ADDC(sum, t1)
160#endif
161	PTR_ADDU	src, src, 0x8
162	andi	t8, src, 0x10
163
	/*
	 * Consume 16 bytes if bit 4 of src is set.  On both paths t8 ends up
	 * as a1 >> 7, the number of 128-byte chunks left.
	 */
164.Loword_align:
165	beqz	t8, .Lbegin_movement
166	 LONG_SRL	t8, a1, 0x7
167
168#ifdef USE_DOUBLE
169	ld	t0, 0x00(src)
170	ld	t1, 0x08(src)
171	ADDC(sum, t0)
172	ADDC(sum, t1)
173#else
174	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
175#endif
176	LONG_SUBU	a1, a1, 0x10
177	PTR_ADDU	src, src, 0x10
178	LONG_SRL	t8, a1, 0x7
179
	/*
	 * Main loop: t8 iterations of 128 bytes each.  a1 is not updated from
	 * here on; the remaining sizes are taken from its low bits (0x40,
	 * 0x20, 0x1c and 0x3).
	 */
180.Lbegin_movement:
181	beqz	t8, 1f
182	 andi	t2, a1, 0x40
183
184.Lmove_128bytes:
185	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
186	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
187	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
188	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
189	LONG_SUBU	t8, t8, 0x01
190	.set	reorder				/* DADDI_WAR */
191	PTR_ADDU	src, src, 0x80
192	bnez	t8, .Lmove_128bytes
193	.set	noreorder
194
	/* One 64-byte chunk if bit 6 of a1 is set (t2 was computed above). */
1951:
196	beqz	t2, 1f
197	 andi	t2, a1, 0x20
198
199.Lmove_64bytes:
200	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
201	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
202	PTR_ADDU	src, src, 0x40
203
	/* One 32-byte chunk if bit 5 of a1 is set. */
2041:
205	beqz	t2, .Ldo_end_words
206	 andi	t8, a1, 0x1c
207
208.Lmove_32bytes:
209	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
210	andi	t8, a1, 0x1c
211	PTR_ADDU	src, src, 0x20
212
	/*
	 * t8 is a byte count here: a1 & 0x1c, or a1 itself when reached from
	 * .Ldword_align.  t2 = a1 & 3 is kept for the tail code and
	 * t8 >> 2 is the number of 32-bit words to add.
	 */
213.Ldo_end_words:
214	beqz	t8, .Lsmall_csumcpy
215	 andi	t2, a1, 0x3
216	LONG_SRL	t8, t8, 0x2
217
218.Lend_words:
219	LOAD32	t0, (src)
220	LONG_SUBU	t8, t8, 0x1
221	ADDC(sum, t0)
222	.set	reorder				/* DADDI_WAR */
223	PTR_ADDU	src, src, 0x4
224	bnez	t8, .Lend_words
225	.set	noreorder
226
227/* unknown src alignment and < 8 bytes to go  */
	/* t2 holds the remaining byte count; bits 2, 1 and 0 are handled in turn. */
228.Lsmall_csumcpy:
229	move	a1, t2
230
231	andi	t0, a1, 4
232	beqz	t0, 1f
233	 andi	t0, a1, 2
234
235	/* Still a full word to go  */
236	ulw	t1, (src)
237	PTR_ADDIU	src, 4
238#ifdef USE_DOUBLE
239	dsll	t1, t1, 32			/* clear lower 32bit */
240#endif
241	ADDC(sum, t1)
242
	/*
	 * Build the last (at most three) bytes in t1: the halfword, if any,
	 * is shifted left by 16 in the delay slot below and the final byte,
	 * if any, is OR-ed in (shifted left by 8 on big-endian).
	 */
2431:	move	t1, zero
244	beqz	t0, 1f
245	 andi	t0, a1, 1
246
247	/* Still a halfword to go  */
248	ulhu	t1, (src)
249	PTR_ADDIU	src, 2
250
2511:	beqz	t0, 1f
252	 sll	t1, t1, 16
253
254	lbu	t2, (src)
255	 nop
256
257#ifdef __MIPSEB__
258	sll	t2, t2, 8
259#endif
260	or	t1, t2
261
2621:	ADDC(sum, t1)
263
264	/* fold checksum */
	/*
	 * 64-bit only: v1 = sum << 32 is added to sum, so the upper half of
	 * sum becomes high + low; the carry of that add is saved in v1 and
	 * added to the upper half after it has been shifted down.
	 */
265#ifdef USE_DOUBLE
266	dsll32	v1, sum, 0
267	daddu	sum, v1
268	sltu	v1, sum, v1
269	dsra32	sum, sum, 0
270	addu	sum, v1
271#endif
272
273	/* odd buffer alignment? */
	/*
	 * When t7 is set, swap the two bytes of each halfword of sum: with
	 * wsbh + movn on MIPSR2, otherwise with the mask 0x00ff00ff built in
	 * v1 (the lui sits in the delay slot of the beqz).
	 */
274#ifdef CONFIG_CPU_MIPSR2
275	wsbh	v1, sum
276	movn	sum, v1, t7
277#else
278	beqz	t7, 1f			/* odd buffer alignment? */
279	 lui	v1, 0x00ff
280	addu	v1, 0x00ff
281	and	t0, sum, v1
282	sll	t0, t0, 8
283	srl	sum, sum, 8
284	and	sum, sum, v1
285	or	sum, sum, t0
2861:
287#endif
288	.set	reorder
289	/* Add the passed partial csum.	 */
290	ADDC32(sum, a2)
291	jr	ra
292	.set	noreorder
293	END(csum_partial)
294
295
296/*
297 * checksum and copy routines based on memcpy.S
298 *
299 *	csum_partial_copy_nocheck(src, dst, len, sum)
300 *	__csum_partial_copy_kernel(src, dst, len, sum, errp)
301 *
302 * See "Spec" in memcpy.S for details.	Unlike __copy_user, all
303 * functions in this file use the standard calling convention.
304 */
305
/*
 * Register roles for the copy-and-checksum code below.  src, dst, len and
 * psum are the first four argument registers.  errptr (t9) is loaded from
 * the fifth argument at the top of __BUILD_CSUM_PARTIAL_COPY_USER, and odd
 * (t8) is set there from bit 0 of dst.  sum (v0) is the accumulator and the
 * return value.
 */
306#define src a0
307#define dst a1
308#define len a2
309#define psum a3
310#define sum v0
311#define odd t8
312#define errptr t9
313
314/*
315 * The exception handler for loads requires that:
316 *  1- AT contain the address of the byte just past the end of the source
317 *     of the copy,
318 *  2- src_entry <= src < AT, and
319 *  3- (dst - src) == (dst_entry - src_entry),
320 * The _entry suffix denotes values when __copy_user was called.
321 *
322 * (1) is set up by __csum_partial_copy_from_user and maintained by
323 *	not writing AT in __csum_partial_copy
324 * (2) is met by incrementing src by the number of bytes copied
325 * (3) is met by not doing loads between a pair of increments of dst and src
326 *
327 * The exception handlers for stores store -EFAULT to errptr and return.
328 * These handlers do not need to overwrite any data.
329 */
330
/*
 * Constants used with EXC and __BUILD_CSUM_PARTIAL_COPY_USER:
 *   LD_INSN / ST_INSN      - "type" argument of EXC (load or store)
 *   LEGACY_MODE / EVA_MODE - values of the macro's "mode" argument
 *   USEROP / KERNELOP      - values of the macro's "from" and "to" arguments
 */
331/* Instruction type */
332#define LD_INSN 1
333#define ST_INSN 2
334#define LEGACY_MODE 1
335#define EVA_MODE    2
336#define USEROP   1
337#define KERNELOP 2
338
339/*
340 * Wrapper to add an entry in the exception table
341 * in case the insn causes a memory exception.
342 * Arguments:
343 * insn    : Load/store instruction
344 * type    : Instruction type
345 * reg     : Register
346 * addr    : Address
347 * handler : Exception handler
348 */
/*
 * EXC expands \mode, \from and \to, so it can only be used inside an
 * assembler .macro that has parameters with those names
 * (__BUILD_CSUM_PARTIAL_COPY_USER in this file).
 *
 * In LEGACY_MODE every wrapped instruction gets an __ex_table entry.  In
 * EVA mode only user loads (from == USEROP) and user stores (to == USEROP)
 * are built with __BUILD_EVA_INSN and get an entry; all other accesses are
 * emitted as plain instructions without one.
 */
349#define EXC(insn, type, reg, addr, handler)	\
350	.if \mode == LEGACY_MODE;		\
3519:		insn reg, addr;			\
352		.section __ex_table,"a";	\
353		PTR	9b, handler;		\
354		.previous;			\
355	/* This is enabled in EVA mode */	\
356	.else;					\
357		/* If loading from user or storing to user */	\
358		.if ((\from == USEROP) && (type == LD_INSN)) || \
359		    ((\to == USEROP) && (type == ST_INSN));	\
3609:			__BUILD_EVA_INSN(insn##e, reg, addr);	\
361			.section __ex_table,"a";		\
362			PTR	9b, handler;			\
363			.previous;				\
364		.else;						\
365			/* EVA without exception */		\
366			insn reg, addr;				\
367		.endif;						\
368	.endif
369
/*
 * LOAD was defined above as a bare mnemonic for csum_partial; from here on
 * it is a function-like macro that takes an exception handler.
 */
370#undef LOAD
371
/*
 * Unit-sized (NBYTES) access wrappers.  Every wrapper except LOADK goes
 * through EXC and therefore names the handler to run if the access faults.
 * LOADL/LOADR and STOREL/STORER are the left/right partial-unit
 * instructions used for unaligned accesses.
 */
372#ifdef USE_DOUBLE
373
374#define LOADK	ld /* No exception */
375#define LOAD(reg, addr, handler)	EXC(ld, LD_INSN, reg, addr, handler)
376#define LOADBU(reg, addr, handler)	EXC(lbu, LD_INSN, reg, addr, handler)
377#define LOADL(reg, addr, handler)	EXC(ldl, LD_INSN, reg, addr, handler)
378#define LOADR(reg, addr, handler)	EXC(ldr, LD_INSN, reg, addr, handler)
379#define STOREB(reg, addr, handler)	EXC(sb, ST_INSN, reg, addr, handler)
380#define STOREL(reg, addr, handler)	EXC(sdl, ST_INSN, reg, addr, handler)
381#define STORER(reg, addr, handler)	EXC(sdr, ST_INSN, reg, addr, handler)
382#define STORE(reg, addr, handler)	EXC(sd, ST_INSN, reg, addr, handler)
383#define ADD    daddu
384#define SUB    dsubu
385#define SRL    dsrl
386#define SLL    dsll
387#define SLLV   dsllv
388#define SRLV   dsrlv
389#define NBYTES 8
390#define LOG_NBYTES 3
391
392#else
393
394#define LOADK	lw /* No exception */
395#define LOAD(reg, addr, handler)	EXC(lw, LD_INSN, reg, addr, handler)
396#define LOADBU(reg, addr, handler)	EXC(lbu, LD_INSN, reg, addr, handler)
397#define LOADL(reg, addr, handler)	EXC(lwl, LD_INSN, reg, addr, handler)
398#define LOADR(reg, addr, handler)	EXC(lwr, LD_INSN, reg, addr, handler)
399#define STOREB(reg, addr, handler)	EXC(sb, ST_INSN, reg, addr, handler)
400#define STOREL(reg, addr, handler)	EXC(swl, ST_INSN, reg, addr, handler)
401#define STORER(reg, addr, handler)	EXC(swr, ST_INSN, reg, addr, handler)
402#define STORE(reg, addr, handler)	EXC(sw, ST_INSN, reg, addr, handler)
403#define ADD    addu
404#define SUB    subu
405#define SRL    srl
406#define SLL    sll
407#define SLLV   sllv
408#define SRLV   srlv
409#define NBYTES 4
410#define LOG_NBYTES 2
411
412#endif /* USE_DOUBLE */
413
/*
 * Endian-neutral names.  LDFIRST/STFIRST take the address of the first
 * byte of an unaligned unit (FIRST(n)), LDREST/STREST the address of its
 * last byte (REST(n)).  SHIFT_DISCARD is the variable shift used to drop
 * unwanted bytes of a unit and SHIFT_DISCARD_REVERT is the opposite shift.
 */
414#ifdef CONFIG_CPU_LITTLE_ENDIAN
415#define LDFIRST LOADR
416#define LDREST	LOADL
417#define STFIRST STORER
418#define STREST	STOREL
419#define SHIFT_DISCARD SLLV
420#define SHIFT_DISCARD_REVERT SRLV
421#else
422#define LDFIRST LOADL
423#define LDREST	LOADR
424#define STFIRST STOREL
425#define STREST	STORER
426#define SHIFT_DISCARD SRLV
427#define SHIFT_DISCARD_REVERT SLLV
428#endif
429
/* Offsets of the first and last byte of the unit-th NBYTES-sized unit. */
430#define FIRST(unit) ((unit)*NBYTES)
431#define REST(unit)  (FIRST(unit)+NBYTES-1)
432
/* Mask of the address bits below the unit size. */
433#define ADDRMASK (NBYTES-1)
434
/*
 * The copy code keeps a live value in AT (see the exception-handler notes
 * above), so the assembler must not use it as its temporary: either it gets
 * no temporary at all, or, with the DADDI workarounds enabled, v1 is
 * designated instead.
 */
435#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
436	.set	noat
437#else
438	.set	at=v1
439#endif
440
	/*
	 * __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to, __nocheck
	 *
	 * Emits the body of a copy-and-checksum routine: copies len bytes
	 * from src to dst while accumulating the checksum in sum, then adds
	 * psum and returns with the result in v0.
	 *
	 * mode      : LEGACY_MODE or EVA_MODE, tested by EXC
	 * from, to  : USEROP or KERNELOP; in EVA mode EXC uses them to decide
	 *             which loads and stores are built with __BUILD_EVA_INSN
	 * __nocheck : when 1, the csum_partial_copy_nocheck entry point is
	 *             emitted inside the body
	 *
	 * Load faults go to .Ll_exc / .Ll_exc_copy, store faults to .Ls_exc.
	 * Labels carry the \@ suffix so that each expansion gets its own set.
	 */
441	.macro __BUILD_CSUM_PARTIAL_COPY_USER mode, from, to, __nocheck
442
443	PTR_ADDU	AT, src, len	/* See (1) above. */
444	/* Load the error pointer (fifth argument) into errptr.  When
445	 * __nocheck == 1 the csum_partial_copy_nocheck entry point follows.
446	 */
447#ifdef CONFIG_64BIT
448	move	errptr, a4
449#else
450	lw	errptr, 16(sp)
451#endif
452	.if \__nocheck == 1
453	FEXPORT(csum_partial_copy_nocheck)
454	.endif
455	move	sum, zero
456	move	odd, zero
457	/*
458	 * Note: dst & src may be unaligned, len may be 0
459	 * Temps
460	 */
461	/*
462	 * The "issue break"s below are very approximate.
463	 * Issue delays for dcache fills will perturb the schedule, as will
464	 * load queue full replay traps, etc.
465	 *
466	 * If len < NBYTES use byte operations.
467	 */
	/* t0 = src & ADDRMASK, t1 = dst & ADDRMASK, odd = dst & 1. */
468	sltu	t2, len, NBYTES
469	and	t1, dst, ADDRMASK
470	bnez	t2, .Lcopy_bytes_checklen\@
471	 and	t0, src, ADDRMASK
472	andi	odd, dst, 0x1			/* odd buffer? */
473	bnez	t1, .Ldst_unaligned\@
474	 nop
475	bnez	t0, .Lsrc_unaligned_dst_aligned\@
476	/*
477	 * use delay slot for fall-through
478	 * src and dst are aligned; need to compute rem
479	 */
480.Lboth_aligned\@:
481	 SRL	t0, len, LOG_NBYTES+3	 # +3 for 8 units/iter
482	beqz	t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
483	 nop
484	SUB	len, 8*NBYTES		# subtract here for bgez loop
485	.align	4
	/* 8 units per iteration: all loads first, then store and sum each unit. */
4861:
487	LOAD(t0, UNIT(0)(src), .Ll_exc\@)
488	LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
489	LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
490	LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
491	LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
492	LOAD(t5, UNIT(5)(src), .Ll_exc_copy\@)
493	LOAD(t6, UNIT(6)(src), .Ll_exc_copy\@)
494	LOAD(t7, UNIT(7)(src), .Ll_exc_copy\@)
495	SUB	len, len, 8*NBYTES
496	ADD	src, src, 8*NBYTES
497	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
498	ADDC(sum, t0)
499	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
500	ADDC(sum, t1)
501	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
502	ADDC(sum, t2)
503	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
504	ADDC(sum, t3)
505	STORE(t4, UNIT(4)(dst),	.Ls_exc\@)
506	ADDC(sum, t4)
507	STORE(t5, UNIT(5)(dst),	.Ls_exc\@)
508	ADDC(sum, t5)
509	STORE(t6, UNIT(6)(dst),	.Ls_exc\@)
510	ADDC(sum, t6)
511	STORE(t7, UNIT(7)(dst),	.Ls_exc\@)
512	ADDC(sum, t7)
513	.set	reorder				/* DADDI_WAR */
514	ADD	dst, dst, 8*NBYTES
515	bgez	len, 1b
516	.set	noreorder
517	ADD	len, 8*NBYTES		# revert len (see above)
518
519	/*
520	 * len == the number of bytes left to copy < 8*NBYTES
521	 */
522.Lcleanup_both_aligned\@:
523#define rem t7
524	beqz	len, .Ldone\@
525	 sltu	t0, len, 4*NBYTES
526	bnez	t0, .Lless_than_4units\@
527	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
528	/*
529	 * len >= 4*NBYTES
530	 */
531	LOAD(t0, UNIT(0)(src), .Ll_exc\@)
532	LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
533	LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
534	LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
535	SUB	len, len, 4*NBYTES
536	ADD	src, src, 4*NBYTES
537	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
538	ADDC(sum, t0)
539	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
540	ADDC(sum, t1)
541	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
542	ADDC(sum, t2)
543	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
544	ADDC(sum, t3)
545	.set	reorder				/* DADDI_WAR */
546	ADD	dst, dst, 4*NBYTES
547	beqz	len, .Ldone\@
548	.set	noreorder
549.Lless_than_4units\@:
550	/*
551	 * rem = len % NBYTES
552	 */
553	beq	rem, len, .Lcopy_bytes\@
554	 nop
	/* Copy whole units one at a time until only rem bytes are left. */
5551:
556	LOAD(t0, 0(src), .Ll_exc\@)
557	ADD	src, src, NBYTES
558	SUB	len, len, NBYTES
559	STORE(t0, 0(dst), .Ls_exc\@)
560	ADDC(sum, t0)
561	.set	reorder				/* DADDI_WAR */
562	ADD	dst, dst, NBYTES
563	bne	rem, len, 1b
564	.set	noreorder
565
566	/*
567	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
568	 * A loop would do only a byte at a time with possible branch
569	 * mispredicts.	 Can't do an explicit LOAD dst,mask,or,STORE
570	 * because can't assume read-access to dst.  Instead, use
571	 * STREST dst, which doesn't require read access to dst.
572	 *
573	 * This code should perform better than a simple loop on modern,
574	 * wide-issue mips processors because the code has fewer branches and
575	 * more instruction-level parallelism.
576	 */
577#define bits t2
578	beqz	len, .Ldone\@
579	 ADD	t1, dst, len	# t1 is just past last byte of dst
580	li	bits, 8*NBYTES
581	SLL	rem, len, 3	# rem = number of bits to keep
582	LOAD(t0, 0(src), .Ll_exc\@)
583	SUB	bits, bits, rem # bits = number of bits to discard
584	SHIFT_DISCARD t0, t0, bits
585	STREST(t0, -1(t1), .Ls_exc\@)
586	SHIFT_DISCARD_REVERT t0, t0, bits
587	.set reorder
588	ADDC(sum, t0)
589	b	.Ldone\@
590	.set noreorder
591.Ldst_unaligned\@:
592	/*
593	 * dst is unaligned
594	 * t0 = src & ADDRMASK
595	 * t1 = dst & ADDRMASK; T1 > 0
596	 * len >= NBYTES
597	 *
598	 * Copy enough bytes to align dst
599	 * Set match = (src and dst have same alignment)
600	 */
601#define match rem
602	LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
603	ADD	t2, zero, NBYTES
604	LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
605	SUB	t2, t2, t1	# t2 = number of bytes copied
606	xor	match, t0, t1
607	STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
608	SLL	t4, t1, 3		# t4 = number of bits to discard
609	SHIFT_DISCARD t3, t3, t4
610	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
611	ADDC(sum, t3)
612	beq	len, t2, .Ldone\@
613	 SUB	len, len, t2
614	ADD	dst, dst, t2
615	beqz	match, .Lboth_aligned\@
616	 ADD	src, src, t2
617
	/*
	 * dst is unit aligned, src is not: each unit is read with an
	 * LDFIRST/LDREST pair and written with a plain STORE, 4 units per
	 * iteration.  rem = bytes left once the 4-unit loop is finished.
	 */
618.Lsrc_unaligned_dst_aligned\@:
619	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
620	beqz	t0, .Lcleanup_src_unaligned\@
621	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
6221:
623/*
624 * Avoid consecutive LD*'s to the same register since some mips
625 * implementations can't issue them in the same cycle.
626 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
627 * are to the same unit (unless src is aligned, but it's not).
628 */
629	LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
630	LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
631	SUB	len, len, 4*NBYTES
632	LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
633	LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
634	LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
635	LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
636	LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
637	LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
638	ADD	src, src, 4*NBYTES
639#ifdef CONFIG_CPU_SB1
640	nop				# improves slotting
641#endif
642	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
643	ADDC(sum, t0)
644	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
645	ADDC(sum, t1)
646	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
647	ADDC(sum, t2)
648	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
649	ADDC(sum, t3)
650	.set	reorder				/* DADDI_WAR */
651	ADD	dst, dst, 4*NBYTES
652	bne	len, rem, 1b
653	.set	noreorder
654
	/* Fewer than 4 units left: copy whole units singly, then the bytes. */
655.Lcleanup_src_unaligned\@:
656	beqz	len, .Ldone\@
657	 and	rem, len, NBYTES-1  # rem = len % NBYTES
658	beq	rem, len, .Lcopy_bytes\@
659	 nop
6601:
661	LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
662	LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
663	ADD	src, src, NBYTES
664	SUB	len, len, NBYTES
665	STORE(t0, 0(dst), .Ls_exc\@)
666	ADDC(sum, t0)
667	.set	reorder				/* DADDI_WAR */
668	ADD	dst, dst, NBYTES
669	bne	len, rem, 1b
670	.set	noreorder
671
672.Lcopy_bytes_checklen\@:
673	beqz	len, .Ldone\@
674	 nop
675.Lcopy_bytes\@:
676	/* 0 < len < NBYTES  */
	/*
	 * Byte-wise tail.  Each byte is stored to dst and also shifted into
	 * its lane of the partial unit t2; t3 is the current shift, starting
	 * at SHIFT_START and stepping by SHIFT_INC.  t2 is added to sum once,
	 * at .Lcopy_bytes_done.
	 */
677#ifdef CONFIG_CPU_LITTLE_ENDIAN
678#define SHIFT_START 0
679#define SHIFT_INC 8
680#else
681#define SHIFT_START 8*(NBYTES-1)
682#define SHIFT_INC -8
683#endif
684	move	t2, zero	# partial word
685	li	t3, SHIFT_START # shift
686/* use .Ll_exc_copy here to return correct sum on fault */
687#define COPY_BYTE(N)			\
688	LOADBU(t0, N(src), .Ll_exc_copy\@);	\
689	SUB	len, len, 1;		\
690	STOREB(t0, N(dst), .Ls_exc\@);	\
691	SLLV	t0, t0, t3;		\
692	addu	t3, SHIFT_INC;		\
693	beqz	len, .Lcopy_bytes_done\@; \
694	 or	t2, t0
695
696	COPY_BYTE(0)
697	COPY_BYTE(1)
698#ifdef USE_DOUBLE
699	COPY_BYTE(2)
700	COPY_BYTE(3)
701	COPY_BYTE(4)
702	COPY_BYTE(5)
703#endif
	/* Last possible byte (index NBYTES-2): no length test is needed after it. */
704	LOADBU(t0, NBYTES-2(src), .Ll_exc_copy\@)
705	SUB	len, len, 1
706	STOREB(t0, NBYTES-2(dst), .Ls_exc\@)
707	SLLV	t0, t0, t3
708	or	t2, t0
709.Lcopy_bytes_done\@:
710	ADDC(sum, t2)
	/*
	 * Common exit, also reached from the load-fault path: fold sum to 32
	 * bits on 64-bit builds, swap the bytes of each halfword when odd is
	 * set, add psum and return.
	 */
711.Ldone\@:
712	/* fold checksum */
713#ifdef USE_DOUBLE
714	dsll32	v1, sum, 0
715	daddu	sum, v1
716	sltu	v1, sum, v1
717	dsra32	sum, sum, 0
718	addu	sum, v1
719#endif
720
721#ifdef CONFIG_CPU_MIPSR2
722	wsbh	v1, sum
723	movn	sum, v1, odd
724#else
725	beqz	odd, 1f			/* odd buffer alignment? */
726	 lui	v1, 0x00ff
727	addu	v1, 0x00ff
728	and	t0, sum, v1
729	sll	t0, t0, 8
730	srl	sum, sum, 8
731	and	sum, sum, v1
732	or	sum, sum, t0
7331:
734#endif
735	.set reorder
736	ADDC32(sum, psum)
737	jr	ra
738	.set noreorder
739
740.Ll_exc_copy\@:
741	/*
742	 * Copy bytes from src until faulting load address (or until a
743	 * lb faults)
744	 *
745	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
746	 * may be more than a byte beyond the last address.
747	 * Hence, the lb below may get an exception.
748	 *
749	 * Assumes src < THREAD_BUADDR($28)
750	 */
751	LOADK	t0, TI_TASK($28)
752	 li	t2, SHIFT_START
753	LOADK	t0, THREAD_BUADDR(t0)
7541:
755	LOADBU(t1, 0(src), .Ll_exc\@)
756	ADD	src, src, 1
757	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
758	SLLV	t1, t1, t2
759	addu	t2, SHIFT_INC
760	ADDC(sum, t1)
761	.set	reorder				/* DADDI_WAR */
762	ADD	dst, dst, 1
763	bne	src, t0, 1b
764	.set	noreorder
	/*
	 * Load fault: t0 = THREAD_BUADDR of the current task, len = AT - t0
	 * bytes were not copied.  Zero that many bytes of dst, store -EFAULT
	 * through errptr and finish via .Ldone.
	 */
765.Ll_exc\@:
766	LOADK	t0, TI_TASK($28)
767	 nop
768	LOADK	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
769	 nop
770	SUB	len, AT, t0		# len number of uncopied bytes
771	/*
772	 * Here's where we rely on src and dst being incremented in tandem,
773	 *   See (3) above.
774	 * dst += (fault addr - src) to put dst at first byte to clear
775	 */
776	ADD	dst, t0			# compute start address in a1
777	SUB	dst, src
778	/*
779	 * Clear len bytes starting at dst.  Can't call __bzero because it
780	 * might modify len.  An inefficient loop for these rare times...
781	 */
	/* src is no longer needed as a pointer; it becomes the counter len - 1. */
782	.set	reorder				/* DADDI_WAR */
783	SUB	src, len, 1
784	beqz	len, .Ldone\@
785	.set	noreorder
7861:	sb	zero, 0(dst)
787	ADD	dst, dst, 1
788	.set	push
789	.set	noat
790#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
791	bnez	src, 1b
792	 SUB	src, src, 1
793#else
794	li	v1, 1
795	bnez	src, 1b
796	 SUB	src, src, v1
797#endif
798	li	v1, -EFAULT
799	b	.Ldone\@
800	 sw	v1, (errptr)
801
	/*
	 * Store fault: return -1 in v0 and store -EFAULT through errptr (the
	 * sw executes in the delay slot of jr).
	 */
802.Ls_exc\@:
803	li	v0, -1 /* invalid checksum */
804	li	v1, -EFAULT
805	jr	ra
806	 sw	v1, (errptr)
807	.set	pop
808	.endm
809
/*
 * Instantiations of __BUILD_CSUM_PARTIAL_COPY_USER.
 *
 * Without CONFIG_EVA a single LEGACY_MODE body serves
 * __csum_partial_copy_kernel, __csum_partial_copy_to_user and
 * __csum_partial_copy_from_user; it also contains the
 * csum_partial_copy_nocheck entry point (__nocheck = 1).
 */
810LEAF(__csum_partial_copy_kernel)
811#ifndef CONFIG_EVA
812FEXPORT(__csum_partial_copy_to_user)
813FEXPORT(__csum_partial_copy_from_user)
814#endif
815__BUILD_CSUM_PARTIAL_COPY_USER LEGACY_MODE USEROP USEROP 1
816END(__csum_partial_copy_kernel)
817
/*
 * With CONFIG_EVA the two user variants get their own EVA_MODE bodies:
 * to_user has from = KERNELOP and to = USEROP, from_user has
 * from = USEROP and to = KERNELOP.
 */
818#ifdef CONFIG_EVA
819LEAF(__csum_partial_copy_to_user)
820__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE KERNELOP USEROP 0
821END(__csum_partial_copy_to_user)
822
823LEAF(__csum_partial_copy_from_user)
824__BUILD_CSUM_PARTIAL_COPY_USER EVA_MODE USEROP KERNELOP 0
825END(__csum_partial_copy_from_user)
826#endif
827