/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we share this code with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the
 * n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1
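/*
 * ADDC is add-with-end-around-carry: after the ADD wraps past the
 * register width, (sum < reg) is exactly the carry-out, so the sltu
 * recovers it and the second ADD folds it back in.  This keeps the wide
 * accumulator equivalent to a 16-bit one's-complement (Internet) sum
 * once it is folded down at the end.
 */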

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(sum, _t0);						\
	ADDC(sum, _t1);						\
	ADDC(sum, _t2);						\
	ADDC(sum, _t3)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
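/*
 * Either way CSUM_BIGCHUNK consumes 0x20 bytes per invocation: four
 * doubleword loads on 64-bit kernels, or two groups of four word loads
 * on 32-bit kernels.  The 0x20/0x40/0x60 offsets in the unrolled loops
 * below rely on this.
 */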

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */

#define src a0
#define sum v0

	.text
	.set	noreorder
	.align	5
LEAF(csum_partial)
	move	sum, zero
	move	t7, zero

	sltiu	t8, a1, 0x8
	bnez	t8, small_csumcpy		/* < 8 bytes to copy */
	 move	t2, a1

	andi	t7, src, 0x1			/* odd buffer? */

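/*
 * Align src one step at a time: a byte if the address is odd, then a
 * halfword, a word, a doubleword and a quadword, folding each piece
 * into sum, until src is 32-byte aligned for the unrolled loops below.
 * Short buffers bail out to do_end_words/small_csumcpy instead, and t7
 * remembers whether the buffer started on an odd address so the final
 * 16-bit result can be byte-swapped back.
 */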
hword_align:
	beqz	t7, word_align
	 andi	t8, src, 0x2

	lbu	t0, (src)
	LONG_SUBU	a1, a1, 0x1
#ifdef __MIPSEL__
	sll	t0, t0, 8
#endif
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x1
	andi	t8, src, 0x2

word_align:
	beqz	t8, dword_align
	 sltiu	t8, a1, 56

	lhu	t0, (src)
	LONG_SUBU	a1, a1, 0x2
	ADDC(sum, t0)
	sltiu	t8, a1, 56
	PTR_ADDU	src, src, 0x2

dword_align:
	bnez	t8, do_end_words
	 move	t8, a1

	andi	t8, src, 0x4
	beqz	t8, qword_align
	 andi	t8, src, 0x8

	lw	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x4
	ADDC(sum, t0)
	PTR_ADDU	src, src, 0x4
	andi	t8, src, 0x8

qword_align:
	beqz	t8, oword_align
	 andi	t8, src, 0x10

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
#else
	lw	t0, 0x00(src)
	lw	t1, 0x04(src)
	LONG_SUBU	a1, a1, 0x8
	ADDC(sum, t0)
	ADDC(sum, t1)
#endif
	PTR_ADDU	src, src, 0x8
	andi	t8, src, 0x10

oword_align:
	beqz	t8, begin_movement
	 LONG_SRL	t8, a1, 0x7

#ifdef USE_DOUBLE
	ld	t0, 0x00(src)
	ld	t1, 0x08(src)
	ADDC(sum, t0)
	ADDC(sum, t1)
#else
	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
	LONG_SRL	t8, a1, 0x7

begin_movement:
	beqz	t8, 1f
	 andi	t2, a1, 0x40

move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	bnez	t8, move_128bytes
	 PTR_ADDU	src, src, 0x80

1:
	beqz	t2, 1f
	 andi	t2, a1, 0x20

move_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

1:
	beqz	t2, do_end_words
	 andi	t8, a1, 0x1c

move_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	andi	t8, a1, 0x1c
	PTR_ADDU	src, src, 0x20

do_end_words:
	beqz	t8, small_csumcpy
	 andi	t2, a1, 0x3
	LONG_SRL	t8, t8, 0x2

end_words:
	lw	t0, (src)
	LONG_SUBU	t8, t8, 0x1
	ADDC(sum, t0)
	bnez	t8, end_words
	 PTR_ADDU	src, src, 0x4

/* unknown src alignment and < 8 bytes to go  */
small_csumcpy:
	move	a1, t2

	andi	t0, a1, 4
	beqz	t0, 1f
	 andi	t0, a1, 2

	/* Still a full word to go  */
	ulw	t1, (src)
	PTR_ADDIU	src, 4
	ADDC(sum, t1)

1:	move	t1, zero
	beqz	t0, 1f
	 andi	t0, a1, 1

	/* Still a halfword to go  */
	ulhu	t1, (src)
	PTR_ADDIU	src, 2

1:	beqz	t0, 1f
	 sll	t1, t1, 16

	lbu	t2, (src)
	 nop

#ifdef __MIPSEB__
	sll	t2, t2, 8
#endif
	or	t1, t2

1:	ADDC(sum, t1)

	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif
	sll	v1, sum, 16
	addu	sum, v1
	sltu	v1, sum, v1
	srl	sum, sum, 16
	addu	sum, v1
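	/*
	 * The fold above reduces the wide accumulator to 16 bits: on
	 * 64-bit kernels the two 32-bit halves are added first (keeping
	 * the carry), then the two 16-bit halves.  For example a 32-bit
	 * value 0x8001_7003 folds to 0x8001 + 0x7003 = 0xf004, while
	 * 0xffff_0002 folds to 0x0001 plus the end-around carry, 0x0002.
	 */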

	/* odd buffer alignment? */
	beqz	t7, 1f
	 nop
	sll	v1, sum, 8
	srl	sum, sum, 8
	or	sum, v1
	andi	sum, 0xffff
1:
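	/*
	 * The swap above compensates for an odd starting address: the
	 * one's-complement sum of byte-swapped data is the byte-swap of
	 * the sum, so accumulating in the wrong byte lanes is repaired
	 * by one final rotate of the folded 16-bit result.
	 */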
	.set	reorder
	/* Add the passed partial csum.  */
	ADDC(sum, a2)
	jr	ra
	.set	noreorder
	END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len, sum)
 *	__csum_partial_copy_user(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
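/*
 * On a faulting user load the partial checksum computed so far is still
 * returned and the uncopied destination bytes are cleared (see l_exc
 * below); on a faulting store the routine returns -1.  Both cases also
 * store -EFAULT through errp.
 */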

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *	not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
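/*
 * EXC wraps a single potentially-faulting access: the "9:" local label
 * marks the instruction, and the __ex_table entry pairs that address
 * with a fixup handler.  On a fault the kernel's exception code looks
 * the faulting PC up in __ex_table and resumes at the handler (l_exc*,
 * s_exc) instead of oopsing.
 */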

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif
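/*
 * lwl/lwr (ldl/ldr) together assemble an unaligned word, but which of
 * the pair covers the lowest-addressed bytes depends on endianness.
 * LDFIRST/LDREST (and STFIRST/STREST) name them by role instead, and
 * SHIFT_DISCARD picks the shift direction that drops the bytes of a
 * partial word that lie outside the buffer.
 */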

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

	.set	noat

LEAF(__csum_partial_copy_user)
	PTR_ADDU	AT, src, len	/* See (1) above. */
#ifdef CONFIG_64BIT
	move	errptr, a4
#else
	lw	errptr, 16(sp)
#endif
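	/*
	 * The fifth argument (errp) arrives in register a4 under the
	 * 64-bit n64 ABI, but o32 passes only four arguments in
	 * registers, so 32-bit kernels pick it up from the caller's
	 * stack frame at 16(sp).
	 */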
FEXPORT(csum_partial_copy_nocheck)
	move	sum, zero
	move	odd, zero
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	bnez	t2, copy_bytes_checklen
	 and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, dst_unaligned
	 nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	 SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
	 nop
	SUB	len, 8*NBYTES		# subtract here for bgez loop
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t5, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t6, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(7)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
	ADDC(sum, t3)
EXC(	STORE	t4, UNIT(4)(dst),	s_exc)
	ADDC(sum, t4)
EXC(	STORE	t5, UNIT(5)(dst),	s_exc)
	ADDC(sum, t5)
EXC(	STORE	t6, UNIT(6)(dst),	s_exc)
	ADDC(sum, t6)
EXC(	STORE	t7, UNIT(7)(dst),	s_exc)
	ADDC(sum, t7)
	bgez	len, 1b
	 ADD	dst, dst, 8*NBYTES
	ADD	len, 8*NBYTES		# revert len (see above)

	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
#define rem t7
	beqz	len, done
	 sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
	ADDC(sum, t3)
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc)
	ADDC(sum, t0)
	bne	rem, len, 1b
	 ADD	dst, dst, NBYTES

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
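	/*
	 * Roughly, with NBYTES == 4 and len == 3: rem is 24 bits to
	 * keep and bits is 8 to discard.  SHIFT_DISCARD pushes the one
	 * unwanted byte out of t0, STREST writes the remaining three
	 * bytes ending at dst + len - 1, and SHIFT_DISCARD_REVERT moves
	 * the kept bytes back to their original lanes so ADDC folds them
	 * into the checksum at the right positions.
	 */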
#define bits t2
	beqz	len, done
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		s_exc)
	SHIFT_DISCARD_REVERT t0, t0, bits
	.set reorder
	ADDC(sum, t0)
	b	done
	.set noreorder
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST t3, FIRST(0)(dst),	s_exc)
	SLL	t4, t1, 3		# t4 = number of bits to discard
	SHIFT_DISCARD t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, done
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	 ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB     len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
	ADDC(sum, t3)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc)
	ADDC(sum, t0)
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
	move	t2, zero	# partial word
	li	t3, SHIFT_START	# shift
/* use l_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)			\
EXC(	lbu	t0, N(src), l_exc_copy);	\
	SUB	len, len, 1;		\
EXC(	sb	t0, N(dst), s_exc);	\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, copy_bytes_done;	\
	 or	t2, t0

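/*
 * Each COPY_BYTE copies one byte and also shifts it into the lane it
 * would occupy in a whole-word load (t3 walks from SHIFT_START by
 * SHIFT_INC in the endian-appropriate direction), so the partial word
 * assembled in t2 can be added to the checksum with a single ADDC at
 * copy_bytes_done.
 */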
	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lbu	t0, NBYTES-2(src), l_exc_copy)
	SUB	len, len, 1
EXC(	sb	t0, NBYTES-2(dst), s_exc)
	SLLV	t0, t0, t3
	or	t2, t0
copy_bytes_done:
	ADDC(sum, t2)
done:
	/* fold checksum */
#ifdef USE_DOUBLE
	dsll32	v1, sum, 0
	daddu	sum, v1
	sltu	v1, sum, v1
	dsra32	sum, sum, 0
	addu	sum, v1
#endif
	sll	v1, sum, 16
	addu	sum, v1
	sltu	v1, sum, v1
	srl	sum, sum, 16
	addu	sum, v1

	/* odd buffer alignment? */
	beqz	odd, 1f
	 nop
	sll	v1, sum, 8
	srl	sum, sum, 8
	or	sum, v1
	andi	sum, 0xffff
1:
	.set reorder
	ADDC(sum, psum)
	jr	ra
	.set noreorder

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 li	t2, SHIFT_START
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lbu	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	SLLV	t1, t1, t2
	addu	t2, SHIFT_INC
	ADDC(sum, t1)
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	 SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	 SUB	src, src, 1
	li	v1, -EFAULT
	b	done
	 sw	v1, (errptr)

s_exc:
	li	v0, -1 /* invalid checksum */
	li	v1, -EFAULT
	jr	ra
	 sw	v1, (errptr)
	END(__csum_partial_copy_user)