/*
 * Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * void hvx_histogram_row(uint8_t *src,     => r0
 *                        int stride,       => r1
 *                        int width,        => r2
 *                        int height,       => r3
 *                        int *hist)        => r4
 */
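
/*
 * For reference, a scalar C sketch of what this routine computes
 * (illustrative only, not part of the original file).  hist must point
 * at 256 int bins, and the routine accumulates into them -- note the
 * read-modify-write of hist[] in step 3 below:
 *
 *     void hvx_histogram_row(uint8_t *src, int stride,
 *                            int width, int height, int *hist)
 *     {
 *         for (int i = 0; i < height; i++) {
 *             for (int j = 0; j < width; j++) {
 *                 hist[src[i * stride + j]]++;
 *             }
 *         }
 *     }
 */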
    .text
    .p2align 2
    .global hvx_histogram_row
    .type hvx_histogram_row, @function
hvx_histogram_row:
    { r2 = lsr(r2, #7)               /* size / VLEN */
      r5 = and(r2, #127)             /* size % VLEN */
      v1 = #0
      v0 = #0
    }
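    /*
     * VLEN is the 128-byte HVX vector length, so lsr(#7) and and(#127)
     * compute width / 128 and width % 128.  Both reads of r2 in the
     * packet above see the pre-packet value: instructions in a packet
     * execute in parallel, so r5 gets width % VLEN even though r2 is
     * overwritten in the same packet.
     */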
    /*
     * Step 1: Clean the whole vector register file
     */
    { v3:2 = v1:0
      v5:4 = v1:0
      p0 = cmp.gt(r2, #0)            /* P0 = (width / VLEN > 0)  */
      p1 = cmp.eq(r5, #0)            /* P1 = (width % VLEN == 0) */
    }
    { q0 = vsetq(r5)
      v7:6 = v1:0
    }
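    /*
     * vsetq(r5) enables the first width % VLEN byte lanes in q0; the
     * predicated vhist(q0) at .loopend uses it to count only the valid
     * bytes of the final partial vector of each row.
     */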
    { v9:8 = v1:0
      v11:10 = v1:0
    }
    { v13:12 = v1:0
      v15:14 = v1:0
    }
    { v17:16 = v1:0
      v19:18 = v1:0
    }
    { v21:20 = v1:0
      v23:22 = v1:0
    }
    { v25:24 = v1:0
      v27:26 = v1:0
    }
    { v29:28 = v1:0
      v31:30 = v1:0
      r10 = add(r0, r1)              /* R10 = &src[1 * stride] */
      loop1(.outerloop, r3)
    }

    /*
     * Step 2: vhist
     */
    .falign
.outerloop:
    { if (!p0) jump .loopend
      loop0(.innerloop, r2)
    }

    .falign
.innerloop:
    { v12.tmp = vmem(r0++#1)
      vhist
    }:endloop0
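    /*
     * vhist consumes the vector loaded into the .tmp register and bumps
     * 16-bit partial counters spread across the entire vector register
     * file (which is why step 1 zeroed all of v0-v31).  Step 3 folds
     * these partial counters into the 256 32-bit bins of hist[].
     */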

    .falign
.loopend:
    if (p1) jump .skip               /* if (width % VLEN == 0) done with current row */
    { v13.tmp = vmem(r0 + #0)
      vhist(q0)
    }
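    /*
     * The masked form vhist(q0) counts only the byte lanes enabled in
     * q0, i.e. the width % VLEN valid bytes of this row's tail.
     */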

    .falign
.skip:
    { r0 = r10                       /* R0 = &src[(i + 1) * stride] */
      r10 = add(r10, r1)             /* R10 = &src[(i + 2) * stride] */
    }:endloop1

    /*
     * Step 3: Sum up the data
     */
    { v0.h = vshuff(v0.h)
      r10 = ##0x00010001
    }
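    /*
     * r10 = ##0x00010001 provides the (1, 1) coefficient pair for vdmpy:
     * each v*.w = vdmpy(v*.h, r10.h):sat below adds adjacent 16-bit
     * counters into saturating 32-bit sums, and the per-register vshuff
     * first pairs up the halfwords that belong to the same bin.
     */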
    v1.h = vshuff(v1.h)
    { v2.h = vshuff(v2.h)
      v0.w = vdmpy(v0.h, r10.h):sat
    }
    { v3.h = vshuff(v3.h)
      v1.w = vdmpy(v1.h, r10.h):sat
    }
    { v4.h = vshuff(v4.h)
      v2.w = vdmpy(v2.h, r10.h):sat
    }
    { v5.h = vshuff(v5.h)
      v3.w = vdmpy(v3.h, r10.h):sat
    }
    { v6.h = vshuff(v6.h)
      v4.w = vdmpy(v4.h, r10.h):sat
    }
    { v7.h = vshuff(v7.h)
      v5.w = vdmpy(v5.h, r10.h):sat
    }
    { v8.h = vshuff(v8.h)
      v6.w = vdmpy(v6.h, r10.h):sat
    }
    { v9.h = vshuff(v9.h)
      v7.w = vdmpy(v7.h, r10.h):sat
    }
    { v10.h = vshuff(v10.h)
      v8.w = vdmpy(v8.h, r10.h):sat
    }
    { v11.h = vshuff(v11.h)
      v9.w = vdmpy(v9.h, r10.h):sat
    }
    { v12.h = vshuff(v12.h)
      v10.w = vdmpy(v10.h, r10.h):sat
    }
    { v13.h = vshuff(v13.h)
      v11.w = vdmpy(v11.h, r10.h):sat
    }
    { v14.h = vshuff(v14.h)
      v12.w = vdmpy(v12.h, r10.h):sat
    }
    { v15.h = vshuff(v15.h)
      v13.w = vdmpy(v13.h, r10.h):sat
    }
    { v16.h = vshuff(v16.h)
      v14.w = vdmpy(v14.h, r10.h):sat
    }
    { v17.h = vshuff(v17.h)
      v15.w = vdmpy(v15.h, r10.h):sat
    }
    { v18.h = vshuff(v18.h)
      v16.w = vdmpy(v16.h, r10.h):sat
    }
    { v19.h = vshuff(v19.h)
      v17.w = vdmpy(v17.h, r10.h):sat
    }
    { v20.h = vshuff(v20.h)
      v18.w = vdmpy(v18.h, r10.h):sat
    }
    { v21.h = vshuff(v21.h)
      v19.w = vdmpy(v19.h, r10.h):sat
    }
    { v22.h = vshuff(v22.h)
      v20.w = vdmpy(v20.h, r10.h):sat
    }
    { v23.h = vshuff(v23.h)
      v21.w = vdmpy(v21.h, r10.h):sat
    }
    { v24.h = vshuff(v24.h)
      v22.w = vdmpy(v22.h, r10.h):sat
    }
    { v25.h = vshuff(v25.h)
      v23.w = vdmpy(v23.h, r10.h):sat
    }
    { v26.h = vshuff(v26.h)
      v24.w = vdmpy(v24.h, r10.h):sat
    }
    { v27.h = vshuff(v27.h)
      v25.w = vdmpy(v25.h, r10.h):sat
    }
    { v28.h = vshuff(v28.h)
      v26.w = vdmpy(v26.h, r10.h):sat
    }
    { v29.h = vshuff(v29.h)
      v27.w = vdmpy(v27.h, r10.h):sat
    }
    { v30.h = vshuff(v30.h)
      v28.w = vdmpy(v28.h, r10.h):sat
    }
    { v31.h = vshuff(v31.h)
      v29.w = vdmpy(v29.h, r10.h):sat
      r28 = #32
    }
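    /*
     * From here the partial sums are reduced with a butterfly: the
     * two-register vshuff(vOdd, vEven, rX) form exchanges rX-byte blocks
     * between the pair (32-byte blocks via r28, then 64-byte blocks via
     * r7 below) so that each following vadd combines word counters that
     * belong to the same histogram bins.
     */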
    { vshuff(v1, v0, r28)
      v30.w = vdmpy(v30.h, r10.h):sat
    }
    { vshuff(v3, v2, r28)
      v31.w = vdmpy(v31.h, r10.h):sat
    }
    { vshuff(v5, v4, r28)
      v0.w = vadd(v1.w, v0.w)
      v2.w = vadd(v3.w, v2.w)
    }
    { vshuff(v7, v6, r28)
      r7 = #64
    }
    { vshuff(v9, v8, r28)
      v4.w = vadd(v5.w, v4.w)
      v6.w = vadd(v7.w, v6.w)
    }
    vshuff(v11, v10, r28)
    { vshuff(v13, v12, r28)
      v8.w = vadd(v9.w, v8.w)
      v10.w = vadd(v11.w, v10.w)
    }
    vshuff(v15, v14, r28)
    { vshuff(v17, v16, r28)
      v12.w = vadd(v13.w, v12.w)
      v14.w = vadd(v15.w, v14.w)
    }
    vshuff(v19, v18, r28)
    { vshuff(v21, v20, r28)
      v16.w = vadd(v17.w, v16.w)
      v18.w = vadd(v19.w, v18.w)
    }
    vshuff(v23, v22, r28)
    { vshuff(v25, v24, r28)
      v20.w = vadd(v21.w, v20.w)
      v22.w = vadd(v23.w, v22.w)
    }
    vshuff(v27, v26, r28)
    { vshuff(v29, v28, r28)
      v24.w = vadd(v25.w, v24.w)
      v26.w = vadd(v27.w, v26.w)
    }
    vshuff(v31, v30, r28)
    { v28.w = vadd(v29.w, v28.w)
      vshuff(v2, v0, r7)
    }
    { v30.w = vadd(v31.w, v30.w)
      vshuff(v6, v4, r7)
      v0.w = vadd(v0.w, v2.w)
    }
    { vshuff(v10, v8, r7)
      v1.tmp = vmem(r4 + #0)         /* update hist[0-31] */
      v0.w = vadd(v0.w, v1.w)
      vmem(r4++#1) = v0.new
    }
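    /*
     * Each hist-update packet is a read-modify-write of 32 bins: v1.tmp
     * holds the current bin values just for this packet, vadd folds in
     * the new counts, and the .new store writes the result back while
     * r4 post-increments to the next 32 bins.
     */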
    { vshuff(v14, v12, r7)
      v4.w = vadd(v4.w, v6.w)
      v8.w = vadd(v8.w, v10.w)
    }
    { vshuff(v18, v16, r7)
      v1.tmp = vmem(r4 + #0)         /* update hist[32-63] */
      v4.w = vadd(v4.w, v1.w)
      vmem(r4++#1) = v4.new
    }
    { vshuff(v22, v20, r7)
      v12.w = vadd(v12.w, v14.w)
      v16.w = vadd(v16.w, v18.w)
    }
    { vshuff(v26, v24, r7)
      v1.tmp = vmem(r4 + #0)         /* update hist[64-95] */
      v8.w = vadd(v8.w, v1.w)
      vmem(r4++#1) = v8.new
    }
    { vshuff(v30, v28, r7)
      v1.tmp = vmem(r4 + #0)         /* update hist[96-127] */
      v12.w = vadd(v12.w, v1.w)
      vmem(r4++#1) = v12.new
    }
    { v20.w = vadd(v20.w, v22.w)
      v1.tmp = vmem(r4 + #0)         /* update hist[128-159] */
      v16.w = vadd(v16.w, v1.w)
      vmem(r4++#1) = v16.new
    }
    { v24.w = vadd(v24.w, v26.w)
      v1.tmp = vmem(r4 + #0)         /* update hist[160-191] */
      v20.w = vadd(v20.w, v1.w)
      vmem(r4++#1) = v20.new
    }
    { v28.w = vadd(v28.w, v30.w)
      v1.tmp = vmem(r4 + #0)         /* update hist[192-223] */
      v24.w = vadd(v24.w, v1.w)
      vmem(r4++#1) = v24.new
    }
    { v1.tmp = vmem(r4 + #0)         /* update hist[224-255] */
      v28.w = vadd(v28.w, v1.w)
      vmem(r4++#1) = v28.new
    }

    jumpr r31
    .size hvx_histogram_row, .-hvx_histogram_row
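
/*
 * Hypothetical caller-side sketch (illustrative; not from the original
 * file).  hist[] must be zeroed by the caller, since the routine
 * accumulates into it:
 *
 *     int hist[256] = { 0 };
 *     hvx_histogram_row(src, stride, width, height, hist);
 */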