/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
#include <linux/config.h>
#include <asm/asm.h>
#include <asm/offset.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
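
/*
 * Illustrative only: a rough C-level sketch of how a caller consumes the
 * __copy_user contract above.  It ignores the non-standard register
 * convention (the uncopied-byte count comes back in a2/len, not as a C
 * return value); the real glue lives in include/asm-mips/uaccess.h.
 *
 *	left = copy_from_user(to, from, n);	... bytes NOT copied
 *	if (left)
 *		return -EFAULT;			... a fault stopped the copy early
 *
 * Because the load exception handlers below zero the uncopied tail of the
 * destination, a short copy never leaves stale data behind in the buffer.
 */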

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
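
/*
 * Illustrative only: given invariants (1)-(3) above, the load exception
 * handler (l_exc, further down) can rebuild everything it needs from the
 * faulting address alone.  With bad_addr being THREAD_BUADDR, i.e. just
 * past the last source byte that was read successfully, the fixup is
 * roughly
 *
 *	len = AT - bad_addr;		... upper bound on uncopied bytes
 *	dst = dst + (bad_addr - src);	... first destination byte to zero
 *	zero len bytes starting at dst;	... done by a small loop below
 *
 * Invariant (3) is what makes the dst adjustment valid: dst - src never
 * changes between entry to __copy_user and the fault.
 */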

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

/*
 * Only the 64-bit kernel can make use of 64-bit registers.
 */
#ifdef CONFIG_MIPS64
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned	# len < 8*NBYTES
	and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	STORE	t4, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t7, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t0, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t1, UNIT(-1)(dst),	s_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	and	rem, len, (NBYTES-1)		# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	bne	rem, len, 1b
	ADD	dst, dst, NBYTES

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, done
	ADD	t1, dst, len		# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3		# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem		# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		s_exc)
	jr	ra
	move	len, zero
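
/*
 * Illustrative only: a worked example of the partial-word store above for
 * a little-endian 32-bit kernel (NBYTES == 4) with len == 3:
 *
 *	rem  = len << 3         =  24	... bits worth keeping
 *	bits = 8*NBYTES - rem   =   8	... bits to discard
 *
 * SHIFT_DISCARD (sllv here) shifts the one unwanted high byte out of t0,
 * leaving the three wanted bytes at the top of the register, and STREST
 * (swl here) stores exactly those three bytes at dst, dst+1 and dst+2,
 * addressed through -1(t1) = dst+len-1.  A big-endian kernel uses the
 * mirrored pair (srlv/swr).  Either way dst is never read and no byte
 * past dst+len-1 is touched.
 */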
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1		# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST t3, FIRST(0)(dst),	s_exc)
	beq	len, t2, done
	SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 * see (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop
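
/*
 * Illustrative only: the overlap test at the top of memmove below is,
 * in C terms, roughly
 *
 *	overlap = (src < dst + len) && (dst < src + len);
 *	if (!overlap)
 *		take the optimised __memcpy path above;
 *
 * Overlapping requests fall through to __rmemcpy, a byte-at-a-time copy
 * that runs backwards (from the last byte) when src < dst and forwards
 * otherwise, so no byte of the overlapping region is overwritten before
 * it has been read.
 */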
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)