/*
 * Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * void hvx_histogram_row(uint8_t *src,    => r0
 *                        int stride,      => r1
 *                        int width,       => r2
 *                        int height,      => r3
 *                        int *hist)       => r4
 */
    .text
    .p2align 2
    .global hvx_histogram_row
    .type hvx_histogram_row, @function
hvx_histogram_row:
    { r2 = lsr(r2, #7)            /* width / VLEN */
      r5 = and(r2, #127)          /* width % VLEN (reads the pre-packet r2) */
      v1 = #0
      v0 = #0
    }
    /*
     * Step 1: Clean the whole vector register file
     *
     * vhist accumulates its counters into every vector register,
     * so all 32 registers must start out as zero.
     */
    { v3:2 = v1:0
      v5:4 = v1:0
      p0 = cmp.gt(r2, #0)         /* P0 = (width / VLEN > 0) */
      p1 = cmp.eq(r5, #0)         /* P1 = (width % VLEN == 0) */
    }
    { q0 = vsetq(r5)
      v7:6 = v1:0
    }
    { v9:8 = v1:0
      v11:10 = v1:0
    }
    { v13:12 = v1:0
      v15:14 = v1:0
    }
    { v17:16 = v1:0
      v19:18 = v1:0
    }
    { v21:20 = v1:0
      v23:22 = v1:0
    }
    { v25:24 = v1:0
      v27:26 = v1:0
    }
    { v29:28 = v1:0
      v31:30 = v1:0
      r10 = add(r0, r1)           /* R10 = &src[1 * stride] */
      loop1(.outerloop, r3)
    }

    /*
     * Step 2: vhist
     *
     * Each inner-loop pass loads one vector of pixels and lets vhist
     * bump 16-bit counters spread across the vector register file.
     * The predicated form vhist(q0) counts only the q0-enabled bytes,
     * handling the partial vector at the end of a row.
     */
    .falign
.outerloop:
    { if (!p0) jump .loopend
      loop0(.innerloop, r2)
    }

    .falign
.innerloop:
    { v12.tmp = vmem(r0++#1)
      vhist
    }:endloop0

    .falign
.loopend:
    if (p1) jump .skip            /* if (width % VLEN == 0) done with current row */
    { v13.tmp = vmem(r0 + #0)
      vhist(q0)
    }

    .falign
.skip:
    { r0 = r10                    /* R0 = &src[(i + 1) * stride] */
      r10 = add(r10, r1)          /* R10 = &src[(i + 2) * stride] */
    }:endloop1
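/*
 * The reduction below folds the 16-bit vhist counters held in v0-v31
 * into 256 word-sized bins. Each register's halfwords are first
 * rearranged with vshuff, then vdmpy against the splatted constant
 * 0x00010001 sums adjacent halfword pairs into saturating word sums.
 * A vshuff/vadd tree (shuffle controls #32 and #64 in r28 and r7)
 * keeps combining partial sums until eight vectors, each holding
 * 32 word bins, remain; these are read-modify-written into hist[]
 * 32 entries at a time.
 */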
    /*
     * Step 3: Sum up the data
     */
    { v0.h = vshuff(v0.h)
      r10 = ##0x00010001
    }
    v1.h = vshuff(v1.h)
    { v2.h = vshuff(v2.h)
      v0.w = vdmpy(v0.h, r10.h):sat
    }
    { v3.h = vshuff(v3.h)
      v1.w = vdmpy(v1.h, r10.h):sat
    }
    { v4.h = vshuff(v4.h)
      v2.w = vdmpy(v2.h, r10.h):sat
    }
    { v5.h = vshuff(v5.h)
      v3.w = vdmpy(v3.h, r10.h):sat
    }
    { v6.h = vshuff(v6.h)
      v4.w = vdmpy(v4.h, r10.h):sat
    }
    { v7.h = vshuff(v7.h)
      v5.w = vdmpy(v5.h, r10.h):sat
    }
    { v8.h = vshuff(v8.h)
      v6.w = vdmpy(v6.h, r10.h):sat
    }
    { v9.h = vshuff(v9.h)
      v7.w = vdmpy(v7.h, r10.h):sat
    }
    { v10.h = vshuff(v10.h)
      v8.w = vdmpy(v8.h, r10.h):sat
    }
    { v11.h = vshuff(v11.h)
      v9.w = vdmpy(v9.h, r10.h):sat
    }
    { v12.h = vshuff(v12.h)
      v10.w = vdmpy(v10.h, r10.h):sat
    }
    { v13.h = vshuff(v13.h)
      v11.w = vdmpy(v11.h, r10.h):sat
    }
    { v14.h = vshuff(v14.h)
      v12.w = vdmpy(v12.h, r10.h):sat
    }
    { v15.h = vshuff(v15.h)
      v13.w = vdmpy(v13.h, r10.h):sat
    }
    { v16.h = vshuff(v16.h)
      v14.w = vdmpy(v14.h, r10.h):sat
    }
    { v17.h = vshuff(v17.h)
      v15.w = vdmpy(v15.h, r10.h):sat
    }
    { v18.h = vshuff(v18.h)
      v16.w = vdmpy(v16.h, r10.h):sat
    }
    { v19.h = vshuff(v19.h)
      v17.w = vdmpy(v17.h, r10.h):sat
    }
    { v20.h = vshuff(v20.h)
      v18.w = vdmpy(v18.h, r10.h):sat
    }
    { v21.h = vshuff(v21.h)
      v19.w = vdmpy(v19.h, r10.h):sat
    }
    { v22.h = vshuff(v22.h)
      v20.w = vdmpy(v20.h, r10.h):sat
    }
    { v23.h = vshuff(v23.h)
      v21.w = vdmpy(v21.h, r10.h):sat
    }
    { v24.h = vshuff(v24.h)
      v22.w = vdmpy(v22.h, r10.h):sat
    }
    { v25.h = vshuff(v25.h)
      v23.w = vdmpy(v23.h, r10.h):sat
    }
    { v26.h = vshuff(v26.h)
      v24.w = vdmpy(v24.h, r10.h):sat
    }
    { v27.h = vshuff(v27.h)
      v25.w = vdmpy(v25.h, r10.h):sat
    }
    { v28.h = vshuff(v28.h)
      v26.w = vdmpy(v26.h, r10.h):sat
    }
    { v29.h = vshuff(v29.h)
      v27.w = vdmpy(v27.h, r10.h):sat
    }
    { v30.h = vshuff(v30.h)
      v28.w = vdmpy(v28.h, r10.h):sat
    }
    { v31.h = vshuff(v31.h)
      v29.w = vdmpy(v29.h, r10.h):sat
      r28 = #32
    }
    { vshuff(v1, v0, r28)
      v30.w = vdmpy(v30.h, r10.h):sat
    }
    { vshuff(v3, v2, r28)
      v31.w = vdmpy(v31.h, r10.h):sat
    }
    { vshuff(v5, v4, r28)
      v0.w = vadd(v1.w, v0.w)
      v2.w = vadd(v3.w, v2.w)
    }
    { vshuff(v7, v6, r28)
      r7 = #64
    }
    { vshuff(v9, v8, r28)
      v4.w = vadd(v5.w, v4.w)
      v6.w = vadd(v7.w, v6.w)
    }
    vshuff(v11, v10, r28)
    { vshuff(v13, v12, r28)
      v8.w = vadd(v9.w, v8.w)
      v10.w = vadd(v11.w, v10.w)
    }
    vshuff(v15, v14, r28)
    { vshuff(v17, v16, r28)
      v12.w = vadd(v13.w, v12.w)
      v14.w = vadd(v15.w, v14.w)
    }
    vshuff(v19, v18, r28)
    { vshuff(v21, v20, r28)
      v16.w = vadd(v17.w, v16.w)
      v18.w = vadd(v19.w, v18.w)
    }
    vshuff(v23, v22, r28)
    { vshuff(v25, v24, r28)
      v20.w = vadd(v21.w, v20.w)
      v22.w = vadd(v23.w, v22.w)
    }
    vshuff(v27, v26, r28)
    { vshuff(v29, v28, r28)
      v24.w = vadd(v25.w, v24.w)
      v26.w = vadd(v27.w, v26.w)
    }
    vshuff(v31, v30, r28)
    { v28.w = vadd(v29.w, v28.w)
      vshuff(v2, v0, r7)
    }
    { v30.w = vadd(v31.w, v30.w)
      vshuff(v6, v4, r7)
      v0.w = vadd(v0.w, v2.w)
    }
    { vshuff(v10, v8, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[0-31] */
      v0.w = vadd(v0.w, v1.w)
      vmem(r4++#1) = v0.new
    }
    { vshuff(v14, v12, r7)
      v4.w = vadd(v4.w, v6.w)
      v8.w = vadd(v8.w, v10.w)
    }
    { vshuff(v18, v16, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[32-63] */
      v4.w = vadd(v4.w, v1.w)
      vmem(r4++#1) = v4.new
    }
    { vshuff(v22, v20, r7)
      v12.w = vadd(v12.w, v14.w)
      v16.w = vadd(v16.w, v18.w)
    }
    { vshuff(v26, v24, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[64-95] */
      v8.w = vadd(v8.w, v1.w)
      vmem(r4++#1) = v8.new
    }
    { vshuff(v30, v28, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[96-127] */
      v12.w = vadd(v12.w, v1.w)
      vmem(r4++#1) = v12.new
    }

    { v20.w = vadd(v20.w, v22.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[128-159] */
      v16.w = vadd(v16.w, v1.w)
      vmem(r4++#1) = v16.new
    }
    { v24.w = vadd(v24.w, v26.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[160-191] */
      v20.w = vadd(v20.w, v1.w)
      vmem(r4++#1) = v20.new
    }
    { v28.w = vadd(v28.w, v30.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[192-223] */
      v24.w = vadd(v24.w, v1.w)
      vmem(r4++#1) = v24.new
    }
    { v1.tmp = vmem(r4 + #0)      /* update hist[224-255] */
      v28.w = vadd(v28.w, v1.w)
      vmem(r4++#1) = v28.new
    }
    jumpr r31
    .size hvx_histogram_row, .-hvx_histogram_row
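/*
 * For reference, a scalar C sketch of the operation this routine
 * performs, written against the prototype in the header comment.
 * This is an illustration, not part of the file's contract: the name
 * hist_row_ref is hypothetical, and it assumes hist[] holds 256 bins
 * that the caller has initialized (the HVX code above read-modify-
 * writes hist[], so counts accumulate onto whatever is there):
 *
 *     void hist_row_ref(const uint8_t *src, int stride,
 *                       int width, int height, int *hist)
 *     {
 *         for (int i = 0; i < height; i++) {
 *             for (int j = 0; j < width; j++) {
 *                 hist[src[i * stride + j]]++;
 *             }
 *         }
 *     }
 */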