/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
	in0:	dest
	in1:	value
	in2:	count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128 B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.

   Requests of fewer than 16 bytes never enter the aligned loops; they are
   handled entirely by .move_bytes_unaligned using st1/st2 stores.  */

#include <asm/asmmacro.h>
#undef ret

// Incoming arguments
#define dest		in0
#define value		in1
#define	cnt		in2

// Scratch general registers
#define tmp		r31			// short-lived temporary
#define save_lc		r30			// caller's ar.lc, restored before returning
#define ptr0		r29			// second store stream in the 128B line loops
#define ptr1		r28			// main store pointer
#define ptr2		r27			// alignment pointer, later a parallel store stream
#define ptr3		r26			// address of the final byte (short path only)
#define ptr9 		r24			// store-ahead ("prefetch") pointer
#define	loopcnt		r23			// value loaded into ar.lc
#define linecnt		r22			// cnt >> LSIZE_SH, i.e. number of full lines
#define bytecnt		r21			// (MIN1+1) - (dest & MIN1)

#define fvalue		f6			// replicated value on the FP side (for stf8)

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6			// default register for same-cycle branches
#define p_nz		p7			// value != 0
#define p_zr		p8			// value == 0 (selects the stf.spill loop)
#define p_unalgn	p9			// dest is not 16B-aligned
#define p_y		p11
#define p_n		p12
#define p_yy		p13
#define p_nn		p14

#define MIN1		15			// alignment mask: 16B - 1
#define MIN1P1HALF	8			// (MIN1+1)/2
#define LINE_SIZE	128			// bytes handled per iteration of the line loops
#define LSIZE_SH        7			// shift amount
#define PREF_AHEAD	8			// number of lines touched ahead of the fill loop

GLOBAL_ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0
	lfetch.nt1 [dest]			//
	.save   ar.lc, save_lc
	mov.i	save_lc = ar.lc
	.body					// must follow the ar.lc save: .save belongs to the prologue region
} { .mmi
	mov	ret0 = dest			// return value
	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest		// aligned address
	and	tmp = MIN1, dest		// prepare to check for correct alignment
	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst		// create 8 identical bytes in word
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0		//
} { .mib
	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }

	// Alignment phase (only when p_unalgn): fill from dest up to the next
	// 16B boundary with at most one st8, st4, st2 and st1, selected by
	// bits 3..0 of bytecnt.  ptr2 starts in the middle of the 16B block and
	// is stepped down after a store or up when the store is skipped.
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = value,-4		//
(p_n)	add	ptr2 = 4, ptr2			//
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt			//
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = value,-2		//
(p_nn)	add	ptr2 = 2, ptr2			//
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	setf.sig fvalue=value			// transfer value to FLP side
(p_y)	st2	[ptr2] = value,-1		//
(p_n)	add	ptr2 = 1, ptr2			//
;; }

{ .mmi
(p_yy)	st1	[ptr2] = value			//
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt			//
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
;; }

{ .mib
	nop.m 0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
;; }

	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
	// Non-zero value: 8-byte stf8 stores of fvalue.
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		//
	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			//
;; }
.pref_l1a:
{ .mib
	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp			//
;; }
	// Each iteration fills the 15 remaining 8-byte words of one line (the
	// first word was written by a store through ptr9) and, while ptr9 is
	// still below the end of the range, touches one more line ahead.
.l1ax:
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf8 [ptr2] = fvalue, 24
(p_scr)	stf8 [ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
;; }

	TEXT_ALIGN(32)
.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
	// Zero value: same structure as L1A, but with 16-byte stf.spill of f0.
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	nop.i   0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp
;; }
.l1bx:
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
;; }

	// Less than one full line left: 32B chunks first, then 8B words.
.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5		// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt			// compute the remaining cnt
	mov.i   ar.lc = loopcnt
;; }
	TEXT_ALIGN(32)
.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
;; } { .mmb
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
;; }
	// Up to three remaining 8-byte words (cnt is below 32 here).
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

{ .mmi
	stf8	[ptr1] = fvalue, 8		// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt			// subtract
;; }
{ .mmi
(p_y)	stf8	[ptr1] = fvalue, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt			// subtract
;; }
{ .mmi						// store
(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt			// subtract
;; }

	// Tail of fewer than 8 bytes starting at ptr1: st4/st2/st1 as selected
	// by bits 2..0 of cnt.
.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value,4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = value,2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	mov.i	ar.lc = save_lc
	br.ret.sptk.many rp
;; }

	// Short path for cnt < 16.  On entry p_y/p_n say whether dest is odd/even.
	// An odd leading byte is written with st1, the middle with pairs of st2
	// (4 bytes per step), and any trailing 2 bytes / 1 byte with a final
	// st2 / st1 (the latter through ptr3).
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le  p_yy, p_nn = 4, cnt
(p_y)	cmp.le  p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt		// prepare last store
	mov.i	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3			// last store
	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
	br.ret.sptk.many rp
}
END(memset)