/*
 * Copyright (c) 2011 Code Aurora Forum. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */

/*
 * Interface shared by both variants below (exactly one is assembled,
 * selected by __HEXAGON_ARCH__):
 *
 *   In:   r0 = destination pointer
 *         r1 = fill value (only the low byte is stored)
 *         r2 = number of bytes to set
 *   Out:  r0 = destination pointer (never modified)
 *
 * Scratch registers written: r2-r10 (v2 variant), r2-r8 (v3 variant),
 * plus predicates p0/p1 and the loop0 hardware-loop registers.
 */


/* Emit the section/alignment/symbol boilerplate and the entry label. */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/* Record the size of the function that started at \name. */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/* FUNCTION: memset (v2 version) */
/*
 * Register roles:
 *   r4    fill byte replicated into all four bytes (r5 = copy, so r5:4
 *         is the doubleword pattern)
 *   r8    running write pointer
 *   r9    8 - (r0 & 7): bytes until doubleword alignment
 *   r3:2  remaining byte count as a 64-bit pair (r3 starts at 0)
 *   r7:6  size of the store being issued, as a 64-bit pair (r7 = 0)
 *   r10   number of doubleword stores
 *
 * NOTE: this variant never uses ".new" predicates.  A predicated
 * instruction therefore sees the predicate value from BEFORE its own
 * packet, so every "p0 = ..." / "p1 = ..." below is consumed by the
 * NEXT packet, not by the branch that sits next to it.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)	/* r7 = dest & 7 */
		p0 = cmp.eq(r2, #0)
		p1 = cmp.gtu(r2, #7)
	}
	{
		r4 = vsplatb(r1)
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0
		r7 = #0
		p0 = tstbit(r9, #0)	/* consumed at 2: (need a byte store?) */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	{
		r6 = #1
		p0 = tstbit(r9, #1)	/* consumed at 3: (need a half store?) */
		p1 = cmp.eq(r2, #1)
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)	/* consumed at 4: (need a word store?) */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)	/* consumed at 5: when the word store is skipped */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)
		p0 = cmp.gtu(r2, #11)	/* pre-subtract count > 11, i.e. > 7 after this store */
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	{
		r10 = lsr(r2, #3)	/* number of whole doublewords left */
		p1 = cmp.eq(r3, #1)	/* r3 is expected to be 0 here, so this clears p1 for 7: */
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)
		p1 = cmp.eq(r2, #8)	/* true when this store consumes the last 8 bytes */
	}:endloop0
	.falign
7: /* skip double loop */
	{
		p0 = tstbit(r2, #2)	/* consumed by the next packet (need a word store?) */
		if p1 jumpr r31
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)	/* consumed at 8: (need a half store?) */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	{
		p1 = cmp.eq(r2, #2)
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/* FUNCTION: memset (v3 and higher version) */
/*
 * Register roles:
 *   r7    fill byte replicated into all four bytes
 *   r5:4  doubleword fill pattern (r7:r7)
 *   r6    running write pointer
 *   r2    remaining byte count
 *   r3    byte-loop pointer for short fills; otherwise scratch and the
 *         count of 32-byte lines
 *   r8    count of doubleword stores in the tail loop
 *
 * Outline: counts of 8 or less use a plain byte loop.  Larger fills
 * align the pointer to 2, 4 and then 8 bytes; fills with more than 127
 * bytes left are additionally aligned to 32 bytes and written one
 * 32-byte line at a time using dczeroa.  The remainder is written with
 * doubleword, word, halfword and byte stores.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)
		r6 = r0
		if (r2==#0) jump:nt .L1		/* count == 0, nothing to do */
	}
	{
		r5:4=combine(r7,r7)
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3		/* more than 8 bytes: aligned path */
	}
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:					/* short fill: one byte per iteration */
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:					/* count > 8: align to 2 bytes */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)	/* count > 8 on this path, so not expected to be true */
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:					/* align to 4 bytes */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)	/* compares the count from before the subtract */
		if (p0.new) jump:nt .L1
	}
.L10:					/* align to 8 bytes */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)	/* compares the count from before the subtract */
		if (p0.new) jump:nt .L1
	}
.L12:					/* r6 is 8-byte aligned from here on */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14	/* 127 or fewer left: skip the 32-byte path */
	}
	/* up to three doubleword stores to reach 32-byte alignment */
	r3 = and(r6,#31)
	if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
	r3 = and(r6,#31)
	if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
	r3 = and(r6,#31)
	if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:					/* r6 is 32-byte aligned */
	{
		r3 = lsr(r2,#5)		/* r3 = number of whole 32-byte lines */
		if (r1!=#0) jump:nt .L18	/* non-zero fill needs explicit stores */
	}
	/*
	 * Zero fill: dczeroa alone writes each 32-byte line.  The former
	 * "r8 = r3" and "r3 = r6" in this packet were dead writes (r8 is
	 * always reloaded at .L14 and r3 is not read again on this path)
	 * and have been dropped; loop0 sees the same r3 value as before.
	 */
		loop0(.L46,r3)
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:					/* tail: whole doublewords first */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:					/* fewer than 8 bytes left: word? */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:					/* halfword? */
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:					/* final byte? */
	p0 = cmp.eq(r2,#1)
	if (p0) memb(r6) = r1
.L1:
	jumpr r31
.L18:					/* non-zero fill, one 32-byte line per iteration */
	loop0(.L45,r3)
	.falign
.L45:
	dczeroa(r6)			/* issued before the four stores that overwrite the line */
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
	memd(r6++#8) = r5:4
	memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45 */
	jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif