/* SPDX-License-Identifier: GPL-2.0 */
/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
	in0:	dest
	in1:	value
	in2:	count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128 B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.

   Control flow, in the order the labels appear below:

	entry			count == 0 returns at once; count < 16 goes to
				.move_bytes_unaligned; otherwise the head of the
				buffer is filled up to the next 16B boundary with
				at most one st8, st4, st2 and st1, selected by
				bits 3..0 of bytecnt.
	L1A (.pref_l1a/.l1ax)	whole 128B lines, value != 0, 8B stf8 stores.
	L1B (.pref_l1b/.l1bx)	whole 128B lines, value == 0, 16B stf.spill f0.
	.fraction_of_line/.l2	32B chunks of what is left (stf8).
	.store_words		up to three further 8B stores.
	.move_bytes_from_alignment
				trailing st4/st2/st1 selected by bits 2..0 of cnt.
	.restore_and_exit	restore ar.lc and return.
	.move_bytes_unaligned	short fills (count < 16) at any alignment.
 */

#include <linux/export.h>
#include <asm/asmmacro.h>
#undef ret

// Incoming arguments
#define dest		in0
#define value		in1
#define	cnt		in2

// General registers used as temporaries
#define tmp		r31
#define save_lc		r30			// copy of ar.lc taken at entry
#define ptr0		r29
#define ptr1		r28
#define ptr2		r27
#define ptr3		r26
#define ptr9 		r24			// store-ahead (prefetch) pointer
#define	loopcnt		r23
#define linecnt		r22			// cnt >> LSIZE_SH, number of whole lines
#define bytecnt		r21			// (MIN1+1) - (dest & MIN1)

#define fvalue		f6			// value transferred to the FP side (setf.sig)

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6			// default register for same-cycle branches
#define p_nz		p7			// value != 0 (set at entry, not read afterwards)
#define p_zr		p8			// value == 0, selects the stf.spill loop
#define p_unalgn	p9			// dest is not 16B aligned
#define p_y		p11
#define p_n		p12
#define p_yy		p13
#define p_nn		p14

#define MIN1		15			// mask for the low address bits (16B alignment)
#define MIN1P1HALF	8			// (MIN1+1)/2
#define LINE_SIZE	128
#define LSIZE_SH        7			// shift amount
#define PREF_AHEAD	8			// upper bound on the initial store-ahead loop count

GLOBAL_ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0
	lfetch.nt1 [dest]			//
	.save   ar.lc, save_lc
	mov.i	save_lc = ar.lc
	.body
} { .mmi
	mov	ret0 = dest			// return value
	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest		// aligned address
	and	tmp = MIN1, dest		// prepare to check for correct alignment
	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst		// create 8 identical bytes in word
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0		//
} { .mib
	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }

	// Head alignment. When dest is not 16B aligned, ptr1 becomes the next
	// 16B boundary and ptr2 starts at the middle of the enclosing 16B block.
	// Bits 3, 2, 1, 0 of bytecnt then select one st8, st4, st2, st1 each;
	// after every step ptr2 is moved down (store done) or up (store skipped)
	// by half the previous step so the next smaller store is placed
	// correctly. With p_unalgn clear the .unc tests clear both of their
	// result predicates, so none of the predicated head stores or pointer
	// adjustments execute.
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = value,-4		//
(p_n)	add	ptr2 = 4, ptr2			//
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt			//
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = value,-2		//
(p_nn)	add	ptr2 = 2, ptr2			//
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	setf.sig fvalue=value			// transfer value to FLP side
(p_y)	st2	[ptr2] = value,-1		//
(p_n)	add	ptr2 = 1, ptr2			//
;; }

{ .mmi
(p_yy)	st1	[ptr2] = value 			//
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt			//
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
;; }

{ .mib
	nop.m 0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
;; }

	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
	// Non-zero value. The first 8 bytes of up to PREF_AHEAD lines are
	// stored ahead through ptr9; .l1ax then writes the remaining 120 bytes
	// of each line (fifteen stf8, interleaved through ptr2 and ptr0) and
	// keeps storing ahead while ptr9 is still below the end of the
	// whole-line range held in ptr1.
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		//
	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			//
;; }
.pref_l1a:
{ .mib
	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp			//
;; }
.l1ax:
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf8 [ptr2] = fvalue, 24
(p_scr)	stf8 [ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le  p_scr, p0 = 8, cnt		// at least 8 bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
;; }

	TEXT_ALIGN(32)
.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
	// Zero value. Same structure as L1A, but each store is a 16B
	// stf.spill of f0: the store-ahead covers the first 16 bytes of a line
	// and .l1bx writes the remaining 112 bytes with seven stores.
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp
;; }
.l1bx:
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
;; }

	// Fewer than one full line remains. Store cnt / 32 chunks of 32 bytes
	// (two stf8 through ptr1 and two through ptr2 = ptr1 + 16 per
	// iteration) and keep cnt % 32 for the code that follows.
.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5		// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt			// compute the remaining cnt
	mov.i   ar.lc = loopcnt
;; }
	TEXT_ALIGN(32)
.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
;; } { .mmb
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
;; }

	// Up to three more 8B stores through ptr1. Each later store is
	// predicated on at least 16 bytes having been left before the
	// previous store.
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

{ .mmi
	stf8	[ptr1] = fvalue, 8		// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt			// subtract
;; }
{ .mmi
(p_y)	stf8	[ptr1] = fvalue, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt			// subtract
;; }
{ .mmi						// store
(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt			// subtract
;; }

	// Tail of fewer than 8 bytes: one st4, st2 and st1 as selected by
	// bits 2, 1 and 0 of cnt.
.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value,4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = value,2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	mov.i	ar.lc = save_lc
	br.ret.sptk.many rp
;; }

	// Short fill (cnt < 16) at arbitrary alignment. On entry p_y/p_n still
	// hold the odd-address test made on dest at the top of the routine.
	// An odd start is fixed with one st1; the body is then written with
	// st2 pairs through ptr1 and ptr2 = ptr1 + 2 (4 bytes per step, up to
	// three steps), followed by an optional final st2 and an optional
	// final st1 through ptr3, which is set to the last byte of the range.
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le  p_yy, p_nn = 4, cnt
(p_y)	cmp.le  p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt		// prepare last store
	mov.i	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3			// last store
	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
	br.ret.sptk.many rp
}
END(memset)
EXPORT_SYMBOL(memset)