/*
 * Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */

/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Restrictions
 *   The arrays should not overlap, the program will produce undefined output
 *   if they do.
 *   For blocks of less than 24 bytes (that do not qualify for the dword
 *   copy) a byte by byte copy is performed. When both pointers and the
 *   length are multiples of 8 and the length is below 96 bytes, a plain
 *   dword copy is performed.
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries,underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural c model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel>>5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70,
offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - len);
 * }
 *
 * Codesize : 784 bytes
 */


/*
 * Register aliases.  Several aliases deliberately share one physical
 * register (e.g. mask/shift/rest are all R8, prolog/epilogdws are both R15,
 * offset/len8 are both R9); each name is only live in its own phase of the
 * routine, so do not extend a lifetime without checking the table.
 */
#define ptr_out		R0	/*  destination pointer  */
#define ptr_in		R1	/*  source pointer  */
#define len		R2	/*  length of copy in bytes  */

#define data70		R13:12	/*  lo 8 bytes of non-aligned transfer  */
#define dataF8		R11:10	/*  hi 8 bytes of non-aligned transfer  */
#define ldata0		R7:6	/*  even 8 bytes chunks  */
#define ldata1		R25:24	/*  odd 8 bytes chunks  */
/*
 * NOTE(review): the comment here used to read "lower 8 bytes of ldata1", but
 * R7 is the odd register of the ldata0 pair (R7:6) and ldata1 is R25:24.
 * data1 is not referenced anywhere below.
 */
#define data1		R7	/*  odd register of the R7:6 (ldata0) pair  */
#define data0		R6	/*  even (low) register of ldata0  */

/* p0 is reused for every short-lived condition below */
#define ifbyte		p0	/*  if transfer has bytes in epilog/prolog  */
#define ifhword		p0	/*  if transfer has shorts in epilog/prolog  */
#define ifword		p0	/*  if transfer has words in epilog/prolog  */
#define noprolog	p0	/*  no sub-dword bytes in the prolog  */
#define nokernel	p1	/*  no 32byte multiple block in the transfer  */
#define noepilog	p0	/*  no epilog, xfer ends on 32byte boundary  */
#define align		p2	/*  alignment of input rel to 8byte boundary  */
#define kernel1		p0	/*  kernel count > 1 (see cmp.gtu uses)  */

#define dalign		R25	/*  rel alignment of input to output data  */
#define star3		R16	/*  prolog bytes not in whole dwords (prolog & 7)  */
#define rest		R8	/*  length - prolog bytes  */
#define back		R7	/*  nr bytes > dword boundary in src block  */
#define epilog		R3	/*  bytes in epilog  */
#define inc		R15:14	/*  {+32, -1}: bump dcfetch ptr by 32, kernel by -1  */
#define kernel		R4	/*  number of 32byte chunks in kernel  */
#define ptr_in_p_128	R5	/*  pointer for prefetch of input data  */
#define mask		R8	/*  mask used to determine prolog size  */
#define shift		R8	/*  used to work a shifter to extract bytes  */
#define shift2		R5	/*  in epilog to workshifter to extract bytes  */
#define prolog		R15	/*  bytes in prolog  */
#define epilogdws	R15	/*  number dwords in epilog  */
#define shiftb		R14	/*  used to extract bytes  */
#define offset		R9	/*  same as align in reg  */
#define ptr_out_p_32	R17	/*  pointer to output dczero  */
#define align888	R14	/*  if simple dword loop can be used  */
#define len8		R9	/*  number of dwords in length  */
#define over		R20	/*  nr of bytes > last inp buf dword boundary  */

#define ptr_in_p_128kernel	R5:4	/*  packed fetch pointer & kernel cnt  */

/*
 * memcpy - copy len bytes from ptr_in to ptr_out.
 *
 * In:   R0 = ptr_out (destination), R1 = ptr_in (source), R2 = len
 * Out:  R0 = the destination pointer that was passed in; every exit either
 *       leaves R0 untouched or rewinds it with "ptr_out = sub(ptr_out, len)".
 *
 * Paths, in the order they are tested:
 *   1. len == 0 or ptr_in == ptr_out       -> return immediately
 *   2. (ptr_in | ptr_out | len) % 8 == 0
 *      and len <= 95                       -> .Ldwordaligned (plain dword loop)
 *   3. len <= 23                           -> .Lbytes23orless (byte loop)
 *   4. otherwise                           -> general path: prolog up to a
 *      32-byte output boundary, 32-byte kernel loop, dword/byte epilog.
 *
 * Only the general path builds a stack frame (it needs R16/R17, R20/R21 and
 * R24/R25).  It also points r31 at .Lmemcpy_return, so every "jumpr r31"
 * reached after that lands on the register-restore code, which then
 * returns to the real caller via deallocframe.  Paths 1-3 return directly.
 *
 * All instructions inside one { } packet read the register values from
 * before the packet; several packets below rely on that.
 */
	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
memcpy:
{
	p2 = cmp.eq(len, #0);		/* nothing to copy? */
	align888 = or(ptr_in, ptr_out);	/* dword path: gather low bits */
	p0 = cmp.gtu(len, #23);		/* p0 clear -> byte-copy path */
	p1 = cmp.eq(ptr_in, ptr_out);	/* attempt to overwrite self */
}
{
	p1 = or(p2, p1);		/* len == 0 or src == dst */
	p3 = cmp.gtu(len, #95);		/* too long for the dword path */
	align888 = or(align888, len);	/* ... and the length's low bits */
	len8 = lsr(len, #3);		/* dword count for the dword path */
}
{
	dcfetch(ptr_in);		/* zero/ptrin=ptrout causes fetch */
	p2 = bitsclr(align888, #7);	/* src, dst and len all 8-aligned */
	if(p1) jumpr r31;		/* nothing to do: R0 still = dest */
}
{
	p2 = and(p2,!p3);			/* aligned and len <= 95 */
	/* the dword loop leaves its last store un-incremented, so rewind
	 * by len - 8 at the end */
	if (p2.new) len = add(len, #-8);
	if (p2.new) jump:NT .Ldwordaligned;
}
{
	if(!p0) jump .Lbytes23orless;	/* len <= 23 */
	mask.l = #LO(0x7fffffff);
	/* all bytes before line multiples of data */
	prolog = sub(#0, ptr_out);
}
{
	/* allocate a 24-byte frame: three dword save slots at sp+0/8/16 */
	allocframe(#24);
	mask.h = #HI(0x7fffffff);
	ptr_in_p_128 = add(ptr_in, #32);
	back = cl0(len);
}
{
	memd(sp+#0) = R17:16;		/* save r16,r17 on stack */
	r31.l = #LO(.Lmemcpy_return);	/* set up final return pointer */
	/* mask >> cl0(len) has ones only below len's top set bit, which
	 * keeps the prolog shorter than len */
	prolog &= lsr(mask, back);
	offset = and(ptr_in, #7);	/* source misalignment within dword */
}
{
	memd(sp+#8) = R25:24;		/* save r25,r24 on stack */
	dalign = sub(ptr_out, ptr_in);
	r31.h = #HI(.Lmemcpy_return);	/* set up final return pointer */
}
{
	/* see if the end of the input buffer is dword aligned */
	over = add(len, ptr_in);
	back = add(len, offset);
	memd(sp+#16) = R21:20;		/* save r20,r21 on stack */
}
{
	noprolog = bitsclr(prolog, #7);	/* no 1/2/4-byte prolog pieces */
	prolog = and(prolog, #31);	/* bytes up to 32-byte dst boundary */
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	kernel = sub(len, prolog);	/* bytes left after the prolog */
	shift = asl(prolog, #3);	/* prolog bits 0..2 -> shift bits 3..5 */
	star3 = and(prolog, #7);	/* sub-dword prolog bytes */
	ptr_in = and(ptr_in, #-8);	/* read source as aligned dwords */
}
{
	prolog = lsr(prolog, #3);	/* prolog is now a dword count */
	epilog = and(kernel, #31);
	/* reads the pre-packet (byte count) value of prolog */
	ptr_out_p_32 = add(ptr_out, prolog);
	over = and(over, #7);		/* bytes past last src dword boundary */
}
{
	p3 = cmp.gtu(back, #8);		/* source reaches into a 2nd dword */
	kernel = lsr(kernel, #5);	/* number of 32-byte chunks */
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	p1 = cmp.eq(prolog, #0);
	/* one extra pass: the prolog loop's store is predicated on p3,
	 * which sp1loop0 keeps false during the first pass */
	if(!p1.new) prolog = add(prolog, #1);
	dcfetch(ptr_in_p_128);	/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	nokernel = cmp.eq(kernel,#0);
	dcfetch(ptr_in_p_128);	/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	shiftb = and(shift, #8);	/* shift out 8 bits iff a byte is stored */
}
{
	dcfetch(ptr_in_p_128);		/* reserve the line 64bytes on */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	if(nokernel) jump .Lskip64;	/* no 32-byte chunk: no dczeroa */
	p2 = cmp.eq(kernel, #1);	/* exactly one 32-byte chunk? */
}
{
	dczeroa(ptr_out_p_32);
	/* don't advance pointer when there is only one chunk */
	if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dalign = and(dalign, #31);
	dczeroa(ptr_out_p_32);
}
.Lskip64:
{
	data70 = memd(ptr_in++#16);
	/* uses the pre-packet ptr_in; skipped when the whole source lies
	 * in the first dword */
	if(p3) dataF8 = memd(ptr_in+#8);
	if(noprolog) jump .Lnoprolog32;
	align = offset;			/* valignb takes its shift from p2 */
}
/* upto initial 7 bytes */
{
	ldata0 = valignb(dataF8, data70, align);
	ifbyte = tstbit(shift,#3);	/* prolog & 1 */
	offset = add(offset, star3);	/* source offset after the prolog */
}
{
	if(ifbyte) memb(ptr_out++#1) = data0;
	ldata0 = lsr(ldata0, shiftb);
	shiftb = and(shift, #16);
	ifhword = tstbit(shift,#4);	/* prolog & 2 */
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifword = tstbit(shift,#5);	/* prolog & 4 */
	p2 = cmp.gtu(offset, #7);	/* crossed into the next src dword */
}
{
	if(ifword) memw(ptr_out++#4) = data0;
	if(p2) data70 = dataF8;
	if(p2) dataF8 = memd(ptr_in++#8);	/* another 8 bytes */
	align = offset;
}
.Lnoprolog32:
{
	p3 = sp1loop0(.Ldword_loop_prolog, prolog)
	rest = sub(len, star3);	/* whats left after the loop */
	p0 = cmp.gt(over, #0);
}
	/* unaligned source end: allow the loop to load further */
	if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p0 = cmp.gt(rest, #16);		/* enough left to load another dword */
}
{
	data70 = dataF8;
	if(p0) dataF8 = memd(ptr_in++#8);
	rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
	/* kernel is at least 32bytes */
	p3 = cmp.gtu(kernel, #0);
	/* last itn. remove edge effects */
	if(p3.new) kernel = add(kernel, #-1);
	/* dealt with in last dword loop */
	if(p3.new) epilog = add(epilog, #32);
}
{
	nokernel = cmp.eq(kernel, #0);		/* after adjustment, recheck */
	if(nokernel.new) jump:NT .Lepilog;	/* likely not taken */
	inc = combine(#32, #-1);		/* R15 = +32, R14 = -1 */
	p3 = cmp.gtu(dalign, #24);
}
/*
 * NOTE(review): dalign > 24 branches to .Lodd_alignment, which runs
 * .Loword_loop_00to24, while dalign <= 24 falls through into
 * .Loword_loop_25to31 - the label names look swapped relative to the
 * condition.  Behaviour is as coded; confirm before renaming.
 */
{
	if(p3) jump .Lodd_alignment;
}
{
	loop0(.Loword_loop_25to31, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = kernel;			/* remembers the starting count */
}
	.falign
.Loword_loop_25to31:
{
	dcfetch(ptr_in_p_128);	/* prefetch 4 lines ahead */
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/* reserve the next 32bytes in cache */
	p3 = cmp.eq(kernel, rest);	/* true only on the first iteration */
}
{
	/* kernel -= 1, ptr_in_p_128 += 32 (one vaddw on the R5:4 pair) */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	/* kill write on first iteration */
	if(!p3) memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
	/* flush the ldata1 dword still pending from the last iteration */
	memd(ptr_out++#8) = ldata1;
	jump .Lepilog;
}
.Lodd_alignment:
{
	loop0(.Loword_loop_00to24, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = add(kernel, #-1);
}
	.falign
.Loword_loop_00to24:
{
	dcfetch(ptr_in_p_128);	/* prefetch 4 lines ahead */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/* reserve the next 32bytes in cache */
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
	noepilog = cmp.eq(epilog,#0);
	epilogdws = lsr(epilog, #3);
	kernel = and(epilog, #7);	/* kernel reused: sub-dword tail bytes */
}
{
	/* r31 = .Lmemcpy_return here, so this goes to the restore code */
	if(noepilog) jumpr r31;
	if(noepilog) ptr_out = sub(ptr_out, len);	/* return dest pointer */
	p3 = cmp.eq(epilogdws, #0);
	shift2 = asl(epilog, #3);
}
{
	shiftb = and(shift2, #32);	/* shift out 32 bits iff a word is stored */
	ifword = tstbit(epilog,#2);
	if(p3) jump .Lepilog60;		/* no whole dwords in the epilog */
	if(!p3) epilog = add(epilog, #-16);
}
{
	loop0(.Ldword_loop_epilog, epilogdws);
	/* stop criteria is lsbs unless = 0 then its 8 */
	p3 = cmp.eq(kernel, #0);
	if(p3.new) kernel= #8;
	p1 = cmp.gt(over, #0);
}
	/* if not aligned to end of buffer execute 1 more iteration */
	if(p1) kernel= #0;
.Ldword_loop_epilog:
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p3 = cmp.gt(epilog, kernel);	/* still safe to load another dword */
}
{
	data70 = dataF8;
	if(p3) dataF8 = memd(ptr_in++#8);
	epilog = add(epilog, #-8);	/* low 3 bits stay intact for tstbit */
}:endloop0
/* copy last 7 bytes */
.Lepilog60:
{
	if(ifword) memw(ptr_out++#4) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifhword = tstbit(epilog,#1);
	shiftb = and(shift2, #16);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifbyte = tstbit(epilog,#0);
	/* the final byte store does not post-increment ptr_out, so
	 * rewind by one byte less */
	if(ifbyte.new) len = add(len, #-1);
}
{
	if(ifbyte) memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);	/* return dest pointer */
	jumpr r31;			/* -> .Lmemcpy_return */
}
/* do byte copy for small n */
.Lbytes23orless:
{
	p3 = sp1loop0(.Lbyte_copy, len);
	len = add(len, #-1);		/* ptr_out only advances len - 1 times */
}
.Lbyte_copy:
{
	data0 = memb(ptr_in++#1);
	/* stores the byte loaded on the previous pass (pre-packet data0) */
	if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
	memb(ptr_out) = data0;		/* last byte, no post-increment */
	ptr_out = sub(ptr_out, len);	/* return dest pointer */
	jumpr r31;			/* no frame on this path: real return */
}
/* do dword copies for aligned in, out and length */
.Ldwordaligned:
{
	p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
	/* stores the dword loaded on the previous pass */
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = memd(ptr_in++#8);
}:endloop0
{
	memd(ptr_out) = ldata0;		/* last dword, no post-increment */
	ptr_out = sub(ptr_out, len);	/* len was pre-decremented by 8 */
	jumpr r31;	/* return to function caller */
}
/* general-path exit: undo the frame built after the fast-path checks */
.Lmemcpy_return:
	r21:20 = memd(sp+#16);	/* restore r20+r21 */
{
	r25:24 = memd(sp+#8);	/* restore r24+r25 */
	r17:16 = memd(sp+#0);	/* restore r16+r17 */
}
	deallocframe;	/* restore r31 and release the 24-byte frame */
	jumpr r31