// SPDX-License-Identifier: LGPL-2.1+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */

#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/videodev2.h>
#include "codec-fwht.h"

/* Returned by derlc() when the compressed input ends prematurely. */
#define OVERFLOW_BIT BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
/* Set in a block's header word by rlc() when the block is a PBLOCK. */
#define PFRAME_BIT BIT(15)
/*
 * Header bits 1..12. NOTE(review): the users of this mask are outside
 * this part of the file; presumably a count of duplicated blocks.
 */
#define DUPS_MASK 0x1ffe

/* Block types, as returned by decide_blocktype() and accepted by rlc(). */
#define PBLOCK 0
#define IBLOCK 1

/*
 * Run-length value (low 4 bits of an rlc word) meaning "the rest of the
 * block is zero". Ordinary zero runs are limited to 14.
 */
#define ALL_ZEROS 15

/*
 * Scan order used by the run-length coder: zigzag[i] is the raster index
 * (x + y * 8) of the i-th coefficient to be coded. The block is walked
 * one anti-diagonal at a time, each diagonal in the same direction,
 * starting at index 0 and ending at index 63.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};

50dacca5f0SHans Verkuil /*
51dacca5f0SHans Verkuil * noinline_for_stack to work around
52dacca5f0SHans Verkuil * https://bugs.llvm.org/show_bug.cgi?id=38809
53dacca5f0SHans Verkuil */
54dacca5f0SHans Verkuil static int noinline_for_stack
rlc(const s16 * in,__be16 * output,int blocktype)55dacca5f0SHans Verkuil rlc(const s16 *in, __be16 *output, int blocktype)
56dacca5f0SHans Verkuil {
57dacca5f0SHans Verkuil s16 block[8 * 8];
58dacca5f0SHans Verkuil s16 *wp = block;
59dacca5f0SHans Verkuil int i = 0;
60dacca5f0SHans Verkuil int x, y;
61dacca5f0SHans Verkuil int ret = 0;
62dacca5f0SHans Verkuil
63dacca5f0SHans Verkuil /* read in block from framebuffer */
64dacca5f0SHans Verkuil int lastzero_run = 0;
65dacca5f0SHans Verkuil int to_encode;
66dacca5f0SHans Verkuil
67dacca5f0SHans Verkuil for (y = 0; y < 8; y++) {
68dacca5f0SHans Verkuil for (x = 0; x < 8; x++) {
69dacca5f0SHans Verkuil *wp = in[x + y * 8];
70dacca5f0SHans Verkuil wp++;
71dacca5f0SHans Verkuil }
72dacca5f0SHans Verkuil }
73dacca5f0SHans Verkuil
74dacca5f0SHans Verkuil /* keep track of amount of trailing zeros */
75dacca5f0SHans Verkuil for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
76dacca5f0SHans Verkuil lastzero_run++;
77dacca5f0SHans Verkuil
78dacca5f0SHans Verkuil *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
79dacca5f0SHans Verkuil ret++;
80dacca5f0SHans Verkuil
81dacca5f0SHans Verkuil to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
82dacca5f0SHans Verkuil
83dacca5f0SHans Verkuil i = 0;
84dacca5f0SHans Verkuil while (i < to_encode) {
85dacca5f0SHans Verkuil int cnt = 0;
86dacca5f0SHans Verkuil int tmp;
87dacca5f0SHans Verkuil
88dacca5f0SHans Verkuil /* count leading zeros */
89dacca5f0SHans Verkuil while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
90dacca5f0SHans Verkuil cnt++;
91dacca5f0SHans Verkuil i++;
92dacca5f0SHans Verkuil if (i == to_encode) {
93dacca5f0SHans Verkuil cnt--;
94dacca5f0SHans Verkuil break;
95dacca5f0SHans Verkuil }
96dacca5f0SHans Verkuil }
97dacca5f0SHans Verkuil /* 4 bits for run, 12 for coefficient (quantization by 4) */
98dacca5f0SHans Verkuil *output++ = htons((cnt | tmp << 4));
99dacca5f0SHans Verkuil i++;
100dacca5f0SHans Verkuil ret++;
101dacca5f0SHans Verkuil }
102dacca5f0SHans Verkuil if (lastzero_run > 14) {
103dacca5f0SHans Verkuil *output = htons(ALL_ZEROS | 0);
104dacca5f0SHans Verkuil ret++;
105dacca5f0SHans Verkuil }
106dacca5f0SHans Verkuil
107dacca5f0SHans Verkuil return ret;
108dacca5f0SHans Verkuil }
109dacca5f0SHans Verkuil
110dacca5f0SHans Verkuil /*
111dacca5f0SHans Verkuil * This function will worst-case increase rlc_in by 65*2 bytes:
112dacca5f0SHans Verkuil * one s16 value for the header and 8 * 8 coefficients of type s16.
113dacca5f0SHans Verkuil */
114dacca5f0SHans Verkuil static noinline_for_stack u16
derlc(const __be16 ** rlc_in,s16 * dwht_out,const __be16 * end_of_input)115dacca5f0SHans Verkuil derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
116dacca5f0SHans Verkuil {
117dacca5f0SHans Verkuil /* header */
118dacca5f0SHans Verkuil const __be16 *input = *rlc_in;
119dacca5f0SHans Verkuil u16 stat;
120dacca5f0SHans Verkuil int dec_count = 0;
121dacca5f0SHans Verkuil s16 block[8 * 8 + 16];
122dacca5f0SHans Verkuil s16 *wp = block;
123dacca5f0SHans Verkuil int i;
124dacca5f0SHans Verkuil
125dacca5f0SHans Verkuil if (input > end_of_input)
126dacca5f0SHans Verkuil return OVERFLOW_BIT;
127dacca5f0SHans Verkuil stat = ntohs(*input++);
128dacca5f0SHans Verkuil
129dacca5f0SHans Verkuil /*
130dacca5f0SHans Verkuil * Now de-compress, it expands one byte to up to 15 bytes
131dacca5f0SHans Verkuil * (or fills the remainder of the 64 bytes with zeroes if it
132dacca5f0SHans Verkuil * is the last byte to expand).
133dacca5f0SHans Verkuil *
134dacca5f0SHans Verkuil * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
135dacca5f0SHans Verkuil * allow for overflow if the incoming data was malformed.
136dacca5f0SHans Verkuil */
137dacca5f0SHans Verkuil while (dec_count < 8 * 8) {
138dacca5f0SHans Verkuil s16 in;
139dacca5f0SHans Verkuil int length;
140dacca5f0SHans Verkuil int coeff;
141dacca5f0SHans Verkuil
142dacca5f0SHans Verkuil if (input > end_of_input)
143dacca5f0SHans Verkuil return OVERFLOW_BIT;
144dacca5f0SHans Verkuil in = ntohs(*input++);
145dacca5f0SHans Verkuil length = in & 0xf;
146dacca5f0SHans Verkuil coeff = in >> 4;
147dacca5f0SHans Verkuil
148dacca5f0SHans Verkuil /* fill remainder with zeros */
149dacca5f0SHans Verkuil if (length == 15) {
150dacca5f0SHans Verkuil for (i = 0; i < 64 - dec_count; i++)
151dacca5f0SHans Verkuil *wp++ = 0;
152dacca5f0SHans Verkuil break;
153dacca5f0SHans Verkuil }
154dacca5f0SHans Verkuil
155dacca5f0SHans Verkuil for (i = 0; i < length; i++)
156dacca5f0SHans Verkuil *wp++ = 0;
157dacca5f0SHans Verkuil *wp++ = coeff;
158dacca5f0SHans Verkuil dec_count += length + 1;
159dacca5f0SHans Verkuil }
160dacca5f0SHans Verkuil
161dacca5f0SHans Verkuil wp = block;
162dacca5f0SHans Verkuil
163dacca5f0SHans Verkuil for (i = 0; i < 64; i++) {
164dacca5f0SHans Verkuil int pos = zigzag[i];
165dacca5f0SHans Verkuil int y = pos / 8;
166dacca5f0SHans Verkuil int x = pos % 8;
167dacca5f0SHans Verkuil
168dacca5f0SHans Verkuil dwht_out[x + y * 8] = *wp++;
169dacca5f0SHans Verkuil }
170dacca5f0SHans Verkuil *rlc_in = input;
171dacca5f0SHans Verkuil return stat;
172dacca5f0SHans Verkuil }
173dacca5f0SHans Verkuil
/*
 * Per-coefficient right-shift amounts applied by quantize_intra() and
 * undone by dequantize_intra(), in raster order of the 8x8 block.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};

/*
 * Per-coefficient right-shift amounts applied by quantize_inter() and
 * undone by dequantize_inter(), in raster order of the 8x8 block.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};

/*
 * Quantize an 8x8 block of intra coefficients in place.
 *
 * @coeff:    64 coefficients; each is shifted right by the matching
 *            quant_table entry, and set to 0 if the result lies within
 *            [-qp, qp]
 * @de_coeff: receives the dequantized value of each coefficient (the
 *            quantized value shifted back left), or 0 if it was dropped
 * @qp:       dead-zone threshold
 */
static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -qp && *coeff <= qp)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}

/*
 * Dequantize an 8x8 block of intra coefficients in place by shifting
 * each one left by the matching quant_table entry.
 */
static void dequantize_intra(s16 *coeff)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}

/*
 * Quantize an 8x8 block of inter (delta) coefficients in place.
 *
 * @coeff:    64 coefficients; each is shifted right by the matching
 *            quant_table_p entry, and set to 0 if the result lies within
 *            [-qp, qp]
 * @de_coeff: receives the dequantized value of each coefficient (the
 *            quantized value shifted back left), or 0 if it was dropped
 * @qp:       dead-zone threshold
 */
static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -qp && *coeff <= qp)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}

/*
 * Dequantize an 8x8 block of inter (delta) coefficients in place by
 * shifting each one left by the matching quant_table_p entry.
 */
static void dequantize_inter(s16 *coeff)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}

fwht(const u8 * block,s16 * output_block,unsigned int stride,unsigned int input_step,bool intra)248dacca5f0SHans Verkuil static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
249dacca5f0SHans Verkuil unsigned int stride,
250dacca5f0SHans Verkuil unsigned int input_step, bool intra)
251dacca5f0SHans Verkuil {
252dacca5f0SHans Verkuil /* we'll need more than 8 bits for the transformed coefficients */
253dacca5f0SHans Verkuil s32 workspace1[8], workspace2[8];
254dacca5f0SHans Verkuil const u8 *tmp = block;
255dacca5f0SHans Verkuil s16 *out = output_block;
256dacca5f0SHans Verkuil int add = intra ? 256 : 0;
257dacca5f0SHans Verkuil unsigned int i;
258dacca5f0SHans Verkuil
259dacca5f0SHans Verkuil /* stage 1 */
260dacca5f0SHans Verkuil for (i = 0; i < 8; i++, tmp += stride, out += 8) {
261dacca5f0SHans Verkuil switch (input_step) {
262dacca5f0SHans Verkuil case 1:
263dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[1] - add;
264dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[1];
265dacca5f0SHans Verkuil
266dacca5f0SHans Verkuil workspace1[2] = tmp[2] + tmp[3] - add;
267dacca5f0SHans Verkuil workspace1[3] = tmp[2] - tmp[3];
268dacca5f0SHans Verkuil
269dacca5f0SHans Verkuil workspace1[4] = tmp[4] + tmp[5] - add;
270dacca5f0SHans Verkuil workspace1[5] = tmp[4] - tmp[5];
271dacca5f0SHans Verkuil
272dacca5f0SHans Verkuil workspace1[6] = tmp[6] + tmp[7] - add;
273dacca5f0SHans Verkuil workspace1[7] = tmp[6] - tmp[7];
274dacca5f0SHans Verkuil break;
275dacca5f0SHans Verkuil case 2:
276dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[2] - add;
277dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[2];
278dacca5f0SHans Verkuil
279dacca5f0SHans Verkuil workspace1[2] = tmp[4] + tmp[6] - add;
280dacca5f0SHans Verkuil workspace1[3] = tmp[4] - tmp[6];
281dacca5f0SHans Verkuil
282dacca5f0SHans Verkuil workspace1[4] = tmp[8] + tmp[10] - add;
283dacca5f0SHans Verkuil workspace1[5] = tmp[8] - tmp[10];
284dacca5f0SHans Verkuil
285dacca5f0SHans Verkuil workspace1[6] = tmp[12] + tmp[14] - add;
286dacca5f0SHans Verkuil workspace1[7] = tmp[12] - tmp[14];
287dacca5f0SHans Verkuil break;
288dacca5f0SHans Verkuil case 3:
289dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[3] - add;
290dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[3];
291dacca5f0SHans Verkuil
292dacca5f0SHans Verkuil workspace1[2] = tmp[6] + tmp[9] - add;
293dacca5f0SHans Verkuil workspace1[3] = tmp[6] - tmp[9];
294dacca5f0SHans Verkuil
295dacca5f0SHans Verkuil workspace1[4] = tmp[12] + tmp[15] - add;
296dacca5f0SHans Verkuil workspace1[5] = tmp[12] - tmp[15];
297dacca5f0SHans Verkuil
298dacca5f0SHans Verkuil workspace1[6] = tmp[18] + tmp[21] - add;
299dacca5f0SHans Verkuil workspace1[7] = tmp[18] - tmp[21];
300dacca5f0SHans Verkuil break;
301dacca5f0SHans Verkuil default:
302dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[4] - add;
303dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[4];
304dacca5f0SHans Verkuil
305dacca5f0SHans Verkuil workspace1[2] = tmp[8] + tmp[12] - add;
306dacca5f0SHans Verkuil workspace1[3] = tmp[8] - tmp[12];
307dacca5f0SHans Verkuil
308dacca5f0SHans Verkuil workspace1[4] = tmp[16] + tmp[20] - add;
309dacca5f0SHans Verkuil workspace1[5] = tmp[16] - tmp[20];
310dacca5f0SHans Verkuil
311dacca5f0SHans Verkuil workspace1[6] = tmp[24] + tmp[28] - add;
312dacca5f0SHans Verkuil workspace1[7] = tmp[24] - tmp[28];
313dacca5f0SHans Verkuil break;
314dacca5f0SHans Verkuil }
315dacca5f0SHans Verkuil
316dacca5f0SHans Verkuil /* stage 2 */
317dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2];
318dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2];
319dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3];
320dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3];
321dacca5f0SHans Verkuil
322dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6];
323dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6];
324dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7];
325dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7];
326dacca5f0SHans Verkuil
327dacca5f0SHans Verkuil /* stage 3 */
328dacca5f0SHans Verkuil out[0] = workspace2[0] + workspace2[4];
329dacca5f0SHans Verkuil out[1] = workspace2[0] - workspace2[4];
330dacca5f0SHans Verkuil out[2] = workspace2[1] - workspace2[5];
331dacca5f0SHans Verkuil out[3] = workspace2[1] + workspace2[5];
332dacca5f0SHans Verkuil out[4] = workspace2[2] + workspace2[6];
333dacca5f0SHans Verkuil out[5] = workspace2[2] - workspace2[6];
334dacca5f0SHans Verkuil out[6] = workspace2[3] - workspace2[7];
335dacca5f0SHans Verkuil out[7] = workspace2[3] + workspace2[7];
336dacca5f0SHans Verkuil }
337dacca5f0SHans Verkuil
338dacca5f0SHans Verkuil out = output_block;
339dacca5f0SHans Verkuil
340dacca5f0SHans Verkuil for (i = 0; i < 8; i++, out++) {
341dacca5f0SHans Verkuil /* stage 1 */
342dacca5f0SHans Verkuil workspace1[0] = out[0] + out[1 * 8];
343dacca5f0SHans Verkuil workspace1[1] = out[0] - out[1 * 8];
344dacca5f0SHans Verkuil
345dacca5f0SHans Verkuil workspace1[2] = out[2 * 8] + out[3 * 8];
346dacca5f0SHans Verkuil workspace1[3] = out[2 * 8] - out[3 * 8];
347dacca5f0SHans Verkuil
348dacca5f0SHans Verkuil workspace1[4] = out[4 * 8] + out[5 * 8];
349dacca5f0SHans Verkuil workspace1[5] = out[4 * 8] - out[5 * 8];
350dacca5f0SHans Verkuil
351dacca5f0SHans Verkuil workspace1[6] = out[6 * 8] + out[7 * 8];
352dacca5f0SHans Verkuil workspace1[7] = out[6 * 8] - out[7 * 8];
353dacca5f0SHans Verkuil
354dacca5f0SHans Verkuil /* stage 2 */
355dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2];
356dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2];
357dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3];
358dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3];
359dacca5f0SHans Verkuil
360dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6];
361dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6];
362dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7];
363dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7];
364dacca5f0SHans Verkuil /* stage 3 */
365dacca5f0SHans Verkuil out[0 * 8] = workspace2[0] + workspace2[4];
366dacca5f0SHans Verkuil out[1 * 8] = workspace2[0] - workspace2[4];
367dacca5f0SHans Verkuil out[2 * 8] = workspace2[1] - workspace2[5];
368dacca5f0SHans Verkuil out[3 * 8] = workspace2[1] + workspace2[5];
369dacca5f0SHans Verkuil out[4 * 8] = workspace2[2] + workspace2[6];
370dacca5f0SHans Verkuil out[5 * 8] = workspace2[2] - workspace2[6];
371dacca5f0SHans Verkuil out[6 * 8] = workspace2[3] - workspace2[7];
372dacca5f0SHans Verkuil out[7 * 8] = workspace2[3] + workspace2[7];
373dacca5f0SHans Verkuil }
374dacca5f0SHans Verkuil }
375dacca5f0SHans Verkuil
376dacca5f0SHans Verkuil /*
377dacca5f0SHans Verkuil * Not the nicest way of doing it, but P-blocks get twice the range of
378dacca5f0SHans Verkuil * that of the I-blocks. Therefore we need a type bigger than 8 bits.
379dacca5f0SHans Verkuil * Furthermore values can be negative... This is just a version that
380dacca5f0SHans Verkuil * works with 16 signed data
381dacca5f0SHans Verkuil */
382dacca5f0SHans Verkuil static void noinline_for_stack
fwht16(const s16 * block,s16 * output_block,int stride,int intra)383dacca5f0SHans Verkuil fwht16(const s16 *block, s16 *output_block, int stride, int intra)
384dacca5f0SHans Verkuil {
385dacca5f0SHans Verkuil /* we'll need more than 8 bits for the transformed coefficients */
386dacca5f0SHans Verkuil s32 workspace1[8], workspace2[8];
387dacca5f0SHans Verkuil const s16 *tmp = block;
388dacca5f0SHans Verkuil s16 *out = output_block;
389dacca5f0SHans Verkuil int i;
390dacca5f0SHans Verkuil
391dacca5f0SHans Verkuil for (i = 0; i < 8; i++, tmp += stride, out += 8) {
392dacca5f0SHans Verkuil /* stage 1 */
393dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[1];
394dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[1];
395dacca5f0SHans Verkuil
396dacca5f0SHans Verkuil workspace1[2] = tmp[2] + tmp[3];
397dacca5f0SHans Verkuil workspace1[3] = tmp[2] - tmp[3];
398dacca5f0SHans Verkuil
399dacca5f0SHans Verkuil workspace1[4] = tmp[4] + tmp[5];
400dacca5f0SHans Verkuil workspace1[5] = tmp[4] - tmp[5];
401dacca5f0SHans Verkuil
402dacca5f0SHans Verkuil workspace1[6] = tmp[6] + tmp[7];
403dacca5f0SHans Verkuil workspace1[7] = tmp[6] - tmp[7];
404dacca5f0SHans Verkuil
405dacca5f0SHans Verkuil /* stage 2 */
406dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2];
407dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2];
408dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3];
409dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3];
410dacca5f0SHans Verkuil
411dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6];
412dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6];
413dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7];
414dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7];
415dacca5f0SHans Verkuil
416dacca5f0SHans Verkuil /* stage 3 */
417dacca5f0SHans Verkuil out[0] = workspace2[0] + workspace2[4];
418dacca5f0SHans Verkuil out[1] = workspace2[0] - workspace2[4];
419dacca5f0SHans Verkuil out[2] = workspace2[1] - workspace2[5];
420dacca5f0SHans Verkuil out[3] = workspace2[1] + workspace2[5];
421dacca5f0SHans Verkuil out[4] = workspace2[2] + workspace2[6];
422dacca5f0SHans Verkuil out[5] = workspace2[2] - workspace2[6];
423dacca5f0SHans Verkuil out[6] = workspace2[3] - workspace2[7];
424dacca5f0SHans Verkuil out[7] = workspace2[3] + workspace2[7];
425dacca5f0SHans Verkuil }
426dacca5f0SHans Verkuil
427dacca5f0SHans Verkuil out = output_block;
428dacca5f0SHans Verkuil
429dacca5f0SHans Verkuil for (i = 0; i < 8; i++, out++) {
430dacca5f0SHans Verkuil /* stage 1 */
431dacca5f0SHans Verkuil workspace1[0] = out[0] + out[1*8];
432dacca5f0SHans Verkuil workspace1[1] = out[0] - out[1*8];
433dacca5f0SHans Verkuil
434dacca5f0SHans Verkuil workspace1[2] = out[2*8] + out[3*8];
435dacca5f0SHans Verkuil workspace1[3] = out[2*8] - out[3*8];
436dacca5f0SHans Verkuil
437dacca5f0SHans Verkuil workspace1[4] = out[4*8] + out[5*8];
438dacca5f0SHans Verkuil workspace1[5] = out[4*8] - out[5*8];
439dacca5f0SHans Verkuil
440dacca5f0SHans Verkuil workspace1[6] = out[6*8] + out[7*8];
441dacca5f0SHans Verkuil workspace1[7] = out[6*8] - out[7*8];
442dacca5f0SHans Verkuil
443dacca5f0SHans Verkuil /* stage 2 */
444dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2];
445dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2];
446dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3];
447dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3];
448dacca5f0SHans Verkuil
449dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6];
450dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6];
451dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7];
452dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7];
453dacca5f0SHans Verkuil
454dacca5f0SHans Verkuil /* stage 3 */
455dacca5f0SHans Verkuil out[0*8] = workspace2[0] + workspace2[4];
456dacca5f0SHans Verkuil out[1*8] = workspace2[0] - workspace2[4];
457dacca5f0SHans Verkuil out[2*8] = workspace2[1] - workspace2[5];
458dacca5f0SHans Verkuil out[3*8] = workspace2[1] + workspace2[5];
459dacca5f0SHans Verkuil out[4*8] = workspace2[2] + workspace2[6];
460dacca5f0SHans Verkuil out[5*8] = workspace2[2] - workspace2[6];
461dacca5f0SHans Verkuil out[6*8] = workspace2[3] - workspace2[7];
462dacca5f0SHans Verkuil out[7*8] = workspace2[3] + workspace2[7];
463dacca5f0SHans Verkuil }
464dacca5f0SHans Verkuil }
465dacca5f0SHans Verkuil
466dacca5f0SHans Verkuil static noinline_for_stack void
ifwht(const s16 * block,s16 * output_block,int intra)467dacca5f0SHans Verkuil ifwht(const s16 *block, s16 *output_block, int intra)
468dacca5f0SHans Verkuil {
469dacca5f0SHans Verkuil /*
470dacca5f0SHans Verkuil * we'll need more than 8 bits for the transformed coefficients
471dacca5f0SHans Verkuil * use native unit of cpu
472dacca5f0SHans Verkuil */
473dacca5f0SHans Verkuil int workspace1[8], workspace2[8];
474dacca5f0SHans Verkuil int inter = intra ? 0 : 1;
475dacca5f0SHans Verkuil const s16 *tmp = block;
476dacca5f0SHans Verkuil s16 *out = output_block;
477dacca5f0SHans Verkuil int i;
478dacca5f0SHans Verkuil
479dacca5f0SHans Verkuil for (i = 0; i < 8; i++, tmp += 8, out += 8) {
480dacca5f0SHans Verkuil /* stage 1 */
481dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[1];
482dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[1];
483dacca5f0SHans Verkuil
484dacca5f0SHans Verkuil workspace1[2] = tmp[2] + tmp[3];
485dacca5f0SHans Verkuil workspace1[3] = tmp[2] - tmp[3];
486dacca5f0SHans Verkuil
487dacca5f0SHans Verkuil workspace1[4] = tmp[4] + tmp[5];
488dacca5f0SHans Verkuil workspace1[5] = tmp[4] - tmp[5];
489dacca5f0SHans Verkuil
490dacca5f0SHans Verkuil workspace1[6] = tmp[6] + tmp[7];
491dacca5f0SHans Verkuil workspace1[7] = tmp[6] - tmp[7];
492dacca5f0SHans Verkuil
493dacca5f0SHans Verkuil /* stage 2 */
494dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2];
495dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2];
496dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3];
497dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3];
498dacca5f0SHans Verkuil
499dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6];
500dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6];
501dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7];
502dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7];
503dacca5f0SHans Verkuil
504dacca5f0SHans Verkuil /* stage 3 */
505dacca5f0SHans Verkuil out[0] = workspace2[0] + workspace2[4];
506dacca5f0SHans Verkuil out[1] = workspace2[0] - workspace2[4];
507dacca5f0SHans Verkuil out[2] = workspace2[1] - workspace2[5];
508dacca5f0SHans Verkuil out[3] = workspace2[1] + workspace2[5];
509dacca5f0SHans Verkuil out[4] = workspace2[2] + workspace2[6];
510dacca5f0SHans Verkuil out[5] = workspace2[2] - workspace2[6];
511dacca5f0SHans Verkuil out[6] = workspace2[3] - workspace2[7];
512dacca5f0SHans Verkuil out[7] = workspace2[3] + workspace2[7];
513dacca5f0SHans Verkuil }
514dacca5f0SHans Verkuil
515dacca5f0SHans Verkuil out = output_block;
516dacca5f0SHans Verkuil
517dacca5f0SHans Verkuil for (i = 0; i < 8; i++, out++) {
518dacca5f0SHans Verkuil /* stage 1 */
519dacca5f0SHans Verkuil workspace1[0] = out[0] + out[1 * 8];
520dacca5f0SHans Verkuil workspace1[1] = out[0] - out[1 * 8];
521dacca5f0SHans Verkuil
522dacca5f0SHans Verkuil workspace1[2] = out[2 * 8] + out[3 * 8];
523dacca5f0SHans Verkuil workspace1[3] = out[2 * 8] - out[3 * 8];
524dacca5f0SHans Verkuil
525dacca5f0SHans Verkuil workspace1[4] = out[4 * 8] + out[5 * 8];
526dacca5f0SHans Verkuil workspace1[5] = out[4 * 8] - out[5 * 8];
527dacca5f0SHans Verkuil
528dacca5f0SHans Verkuil workspace1[6] = out[6 * 8] + out[7 * 8];
529dacca5f0SHans Verkuil workspace1[7] = out[6 * 8] - out[7 * 8];
530dacca5f0SHans Verkuil
531dacca5f0SHans Verkuil /* stage 2 */
532dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2];
533dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2];
534dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3];
535dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3];
536dacca5f0SHans Verkuil
537dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6];
538dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6];
539dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7];
540dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7];
541dacca5f0SHans Verkuil
542dacca5f0SHans Verkuil /* stage 3 */
543dacca5f0SHans Verkuil if (inter) {
544dacca5f0SHans Verkuil int d;
545dacca5f0SHans Verkuil
546dacca5f0SHans Verkuil out[0 * 8] = workspace2[0] + workspace2[4];
547dacca5f0SHans Verkuil out[1 * 8] = workspace2[0] - workspace2[4];
548dacca5f0SHans Verkuil out[2 * 8] = workspace2[1] - workspace2[5];
549dacca5f0SHans Verkuil out[3 * 8] = workspace2[1] + workspace2[5];
550dacca5f0SHans Verkuil out[4 * 8] = workspace2[2] + workspace2[6];
551dacca5f0SHans Verkuil out[5 * 8] = workspace2[2] - workspace2[6];
552dacca5f0SHans Verkuil out[6 * 8] = workspace2[3] - workspace2[7];
553dacca5f0SHans Verkuil out[7 * 8] = workspace2[3] + workspace2[7];
554dacca5f0SHans Verkuil
555dacca5f0SHans Verkuil for (d = 0; d < 8; d++)
556dacca5f0SHans Verkuil out[8 * d] >>= 6;
557dacca5f0SHans Verkuil } else {
558dacca5f0SHans Verkuil int d;
559dacca5f0SHans Verkuil
560dacca5f0SHans Verkuil out[0 * 8] = workspace2[0] + workspace2[4];
561dacca5f0SHans Verkuil out[1 * 8] = workspace2[0] - workspace2[4];
562dacca5f0SHans Verkuil out[2 * 8] = workspace2[1] - workspace2[5];
563dacca5f0SHans Verkuil out[3 * 8] = workspace2[1] + workspace2[5];
564dacca5f0SHans Verkuil out[4 * 8] = workspace2[2] + workspace2[6];
565dacca5f0SHans Verkuil out[5 * 8] = workspace2[2] - workspace2[6];
566dacca5f0SHans Verkuil out[6 * 8] = workspace2[3] - workspace2[7];
567dacca5f0SHans Verkuil out[7 * 8] = workspace2[3] + workspace2[7];
568dacca5f0SHans Verkuil
569dacca5f0SHans Verkuil for (d = 0; d < 8; d++) {
570dacca5f0SHans Verkuil out[8 * d] >>= 6;
571dacca5f0SHans Verkuil out[8 * d] += 128;
572dacca5f0SHans Verkuil }
573dacca5f0SHans Verkuil }
574dacca5f0SHans Verkuil }
575dacca5f0SHans Verkuil }
576dacca5f0SHans Verkuil
/*
 * Copy an 8x8 block of 8-bit samples into a contiguous s16 array.
 *
 * @input:      top-left sample of the source block
 * @dst:        receives the 64 samples in raster order
 * @stride:     distance in bytes between the starts of two source rows
 * @input_step: distance in bytes between two adjacent samples in a row
 */
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input += input_step)
			*dst++ = *input;
		/* rewind over the 8 samples just read, advance one row */
		input += stride - 8 * input_step;
	}
}

/*
 * Measure the activity of an 8x8 block: the sum of the absolute
 * differences between each of the 64 values and their (integer) mean.
 */
static int var_intra(const s16 *input)
{
	int32_t mean = 0;
	int32_t ret = 0;
	const s16 *tmp = input;
	int i;

	for (i = 0; i < 8 * 8; i++, tmp++)
		mean += *tmp;
	mean /= 64;
	tmp = input;
	for (i = 0; i < 8 * 8; i++, tmp++) {
		int32_t diff = *tmp - mean;

		ret += diff < 0 ? -diff : diff;
	}
	return ret;
}

/*
 * Measure how much two 8x8 blocks differ: the sum of the absolute
 * differences between the 64 corresponding values of @old and @new.
 */
static int var_inter(const s16 *old, const s16 *new)
{
	int32_t ret = 0;
	int i;

	for (i = 0; i < 8 * 8; i++, old++, new++) {
		int diff = *old - *new;

		ret += diff < 0 ? -diff : diff;
	}
	return ret;
}

615dacca5f0SHans Verkuil static noinline_for_stack int
decide_blocktype(const u8 * cur,const u8 * reference,s16 * deltablock,unsigned int stride,unsigned int input_step)616dacca5f0SHans Verkuil decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
617dacca5f0SHans Verkuil unsigned int stride, unsigned int input_step)
618dacca5f0SHans Verkuil {
619dacca5f0SHans Verkuil s16 tmp[64];
620dacca5f0SHans Verkuil s16 old[64];
621dacca5f0SHans Verkuil s16 *work = tmp;
622dacca5f0SHans Verkuil unsigned int k, l;
623dacca5f0SHans Verkuil int vari;
624dacca5f0SHans Verkuil int vard;
625dacca5f0SHans Verkuil
626dacca5f0SHans Verkuil fill_encoder_block(cur, tmp, stride, input_step);
627dacca5f0SHans Verkuil fill_encoder_block(reference, old, 8, 1);
628dacca5f0SHans Verkuil vari = var_intra(tmp);
629dacca5f0SHans Verkuil
630dacca5f0SHans Verkuil for (k = 0; k < 8; k++) {
631dacca5f0SHans Verkuil for (l = 0; l < 8; l++) {
632dacca5f0SHans Verkuil *deltablock = *work - *reference;
633dacca5f0SHans Verkuil deltablock++;
634dacca5f0SHans Verkuil work++;
635dacca5f0SHans Verkuil reference++;
636dacca5f0SHans Verkuil }
637dacca5f0SHans Verkuil }
638dacca5f0SHans Verkuil deltablock -= 64;
639dacca5f0SHans Verkuil vard = var_inter(old, tmp);
640dacca5f0SHans Verkuil return vari <= vard ? IBLOCK : PBLOCK;
641dacca5f0SHans Verkuil }
642dacca5f0SHans Verkuil
/*
 * Store an 8x8 block of decoded values as 8-bit samples, clamping each
 * value to the range 0..255.
 *
 * @dst:      top-left sample of the destination block
 * @input:    the 64 decoded values in raster order
 * @stride:   distance in bytes between the starts of two destination rows
 * @dst_step: distance in bytes between two adjacent destination samples
 */
static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
			       unsigned int dst_step)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input++, dst += dst_step) {
			if (*input < 0)
				*dst = 0;
			else if (*input > 255)
				*dst = 255;
			else
				*dst = *input;
		}
		/* rewind over the 8 samples just written, advance one row */
		dst += stride - (8 * dst_step);
	}
}

/*
 * Add an 8x8 reference block to an 8x8 block of deltas, in place.
 *
 * @deltas:	64 consecutive deltas; each is replaced by "delta + reference"
 * @ref:	top-left sample of the reference block
 * @stride:	distance in bytes between the starts of two reference rows
 * @ref_step:	distance in bytes between two reference samples in a row
 *
 * Each sum is clamped to 0..255.
 */
static void add_deltas(s16 *deltas, const u8 *ref, int stride,
		       unsigned int ref_step)
{
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++, deltas++, ref += ref_step) {
			s16 sum = *deltas + *ref;

			/*
			 * Due to quantizing, it might be possible that the
			 * decoded coefficients are slightly out of range.
			 */
			if (sum > 255)
				sum = 255;
			else if (sum < 0)
				sum = 0;
			*deltas = sum;
		}
		/* ref is 8 samples into the row; move to the next row. */
		ref += stride - (8 * ref_step);
	}
}
683dacca5f0SHans Verkuil
/*
 * Encode one plane, 8x8 block by 8x8 block, appending the result at *@rlco.
 *
 * @input:	top-left sample of the plane to encode
 * @refp:	reference plane, stored as consecutive 64-byte blocks; it is
 *		overwritten with the reconstructed blocks unless
 *		@next_is_intra is set
 * @rlco:	in/out write position in the compressed buffer
 * @rlco_max:	once *@rlco reaches this position the plane is abandoned
 *		and stored as raw samples instead
 * @cf:		provides the scratch buffers and the quantization parameters
 * @height:	plane height, rounded up to a multiple of 8 here
 * @width:	plane width, rounded up to a multiple of 8 here
 * @stride:	distance in bytes between the starts of two input rows
 * @input_step:	distance in bytes between two input samples in a row
 * @is_intra:	code every block as IBLOCK without consulting the reference
 * @next_is_intra: skip updating the reference plane
 *
 * Returns FWHT_FRAME_PCODED if at least one block was inter coded, or
 * FWHT_FRAME_UNENCODED (without FWHT_FRAME_PCODED) if the plane was stored
 * as raw samples.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *const plane_start = input;
	__be16 *const out_start = *rlco;
	const __be16 pframe_bit = htons(PFRAME_BIT);
	s16 deltablock[64];
	u32 flags = 0;
	unsigned int prev_size = 0;
	unsigned int bx, by;

	width = round_up(width, 8);
	height = round_up(height, 8);

	for (by = 0; by < height / 8; by++) {
		input = plane_start + by * 8 * stride;
		for (bx = 0; bx < width / 8; bx++) {
			unsigned int size;
			bool merged = false;
			int blocktype;

			/* Intra frames never look at the reference. */
			if (is_intra)
				blocktype = IBLOCK;
			else
				blocktype = decide_blocktype(input, refp,
							     deltablock,
							     stride,
							     input_step);

			if (blocktype != IBLOCK) {
				/* inter code: transform the deltas */
				flags |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			} else {
				/* intra code: transform the input samples */
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			}

			/*
			 * Reconstruct the block into the reference plane so
			 * the next frame can be predicted from it.
			 */
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8, 1);
				fill_decoder_block(refp, cf->de_fwht, 8, 1);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);

			/*
			 * If this block has the same payload as the previous
			 * one, bump the previous header's duplicate counter
			 * instead of storing the block again. The headers
			 * must agree on the P-frame bit and the counter must
			 * not be saturated yet.
			 */
			if (size == prev_size &&
			    !memcmp(*rlco + 1, *rlco - size + 1,
				    2 * size - 2)) {
				__be16 *prev = *rlco - size;
				s16 prev_hdr = ntohs(*prev);

				if (!((*prev ^ **rlco) & pframe_bit) &&
				    (prev_hdr & DUPS_MASK) < DUPS_MASK) {
					*prev = htons(prev_hdr + 2);
					merged = true;
				}
			}
			if (!merged)
				*rlco += size;

			if (*rlco >= rlco_max) {
				flags |= FWHT_FRAME_UNENCODED;
				goto finish;
			}
			prev_size = size;
		}
	}

finish:
	if (flags & FWHT_FRAME_UNENCODED) {
		u8 *out = (u8 *)out_start;

		input = plane_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (by = 0; by < height; by++) {
			const u8 *src = input;

			for (bx = 0; bx < width; bx++) {
				u8 val = *src;

				*out++ = (val == 0xff) ? 0xfe : val;
				src += input_step;
			}
			input += stride;
		}
		*rlco = (__be16 *)out;
		flags &= ~FWHT_FRAME_PCODED;
	}
	return flags;
}
776dacca5f0SHans Verkuil
fwht_encode_frame(struct fwht_raw_frame * frm,struct fwht_raw_frame * ref_frm,struct fwht_cframe * cf,bool is_intra,bool next_is_intra,unsigned int width,unsigned int height,unsigned int stride,unsigned int chroma_stride)777dacca5f0SHans Verkuil u32 fwht_encode_frame(struct fwht_raw_frame *frm,
778dacca5f0SHans Verkuil struct fwht_raw_frame *ref_frm,
779dacca5f0SHans Verkuil struct fwht_cframe *cf,
780dacca5f0SHans Verkuil bool is_intra, bool next_is_intra,
781dacca5f0SHans Verkuil unsigned int width, unsigned int height,
782dacca5f0SHans Verkuil unsigned int stride, unsigned int chroma_stride)
783dacca5f0SHans Verkuil {
784dacca5f0SHans Verkuil unsigned int size = height * width;
785dacca5f0SHans Verkuil __be16 *rlco = cf->rlc_data;
786dacca5f0SHans Verkuil __be16 *rlco_max;
787dacca5f0SHans Verkuil u32 encoding;
788dacca5f0SHans Verkuil
789dacca5f0SHans Verkuil rlco_max = rlco + size / 2 - 256;
790dacca5f0SHans Verkuil encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
791dacca5f0SHans Verkuil height, width, stride,
792dacca5f0SHans Verkuil frm->luma_alpha_step, is_intra, next_is_intra);
793dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED)
794dacca5f0SHans Verkuil encoding |= FWHT_LUMA_UNENCODED;
795dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED;
796dacca5f0SHans Verkuil
797dacca5f0SHans Verkuil if (frm->components_num >= 3) {
798dacca5f0SHans Verkuil u32 chroma_h = height / frm->height_div;
799dacca5f0SHans Verkuil u32 chroma_w = width / frm->width_div;
800dacca5f0SHans Verkuil unsigned int chroma_size = chroma_h * chroma_w;
801dacca5f0SHans Verkuil
802dacca5f0SHans Verkuil rlco_max = rlco + chroma_size / 2 - 256;
803dacca5f0SHans Verkuil encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
804dacca5f0SHans Verkuil cf, chroma_h, chroma_w,
805dacca5f0SHans Verkuil chroma_stride, frm->chroma_step,
806dacca5f0SHans Verkuil is_intra, next_is_intra);
807dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED)
808dacca5f0SHans Verkuil encoding |= FWHT_CB_UNENCODED;
809dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED;
810dacca5f0SHans Verkuil rlco_max = rlco + chroma_size / 2 - 256;
811dacca5f0SHans Verkuil encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
812dacca5f0SHans Verkuil cf, chroma_h, chroma_w,
813dacca5f0SHans Verkuil chroma_stride, frm->chroma_step,
814dacca5f0SHans Verkuil is_intra, next_is_intra);
815dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED)
816dacca5f0SHans Verkuil encoding |= FWHT_CR_UNENCODED;
817dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED;
818dacca5f0SHans Verkuil }
819dacca5f0SHans Verkuil
820dacca5f0SHans Verkuil if (frm->components_num == 4) {
821dacca5f0SHans Verkuil rlco_max = rlco + size / 2 - 256;
822dacca5f0SHans Verkuil encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
823dacca5f0SHans Verkuil rlco_max, cf, height, width,
824dacca5f0SHans Verkuil stride, frm->luma_alpha_step,
825dacca5f0SHans Verkuil is_intra, next_is_intra);
826dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED)
827dacca5f0SHans Verkuil encoding |= FWHT_ALPHA_UNENCODED;
828dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED;
829dacca5f0SHans Verkuil }
830dacca5f0SHans Verkuil
831dacca5f0SHans Verkuil cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
832dacca5f0SHans Verkuil return encoding;
833dacca5f0SHans Verkuil }
834dacca5f0SHans Verkuil
/*
 * Decode one plane from the compressed stream at *@rlco into @dst.
 *
 * @cf:		provides the coefficient and reconstruction scratch buffers
 * @rlco:	in/out read position in the compressed buffer
 * @height:	plane height, rounded up to a multiple of 8 here
 * @width:	plane width, rounded up to a multiple of 8 here
 * @ref:	reference plane, or NULL; with NULL every block is treated
 *		as intra coded regardless of its header
 * @ref_stride:	distance in bytes between the starts of two reference rows
 * @ref_step:	distance in bytes between two reference samples in a row
 * @dst:	top-left sample of the destination plane
 * @dst_stride:	distance in bytes between the starts of two destination rows
 * @dst_step:	distance in bytes between two destination samples in a row
 * @uncompressed: the plane is stored as raw samples, one byte per sample
 * @end_of_rlco_buf: bound of the compressed buffer; the raw-plane size check
 *		treats @end_of_rlco_buf + 1 as the end of the readable data
 *
 * Returns false if the stream is too short for a raw plane or if derlc()
 * reports an overflow, true otherwise.
 */
static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
			 unsigned int ref_step, u8 *dst,
			 unsigned int dst_stride, unsigned int dst_step,
			 bool uncompressed, const __be16 *end_of_rlco_buf)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	unsigned int i, j;
	bool is_intra = !ref;
	/* Type of the most recently parsed block; reused for its copies. */
	bool is_pblock = false;

	width = round_up(width, 8);
	height = round_up(height, 8);

	if (uncompressed) {
		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
			return false;
		for (j = 0; j < height; j++) {
			const u8 *src = (const u8 *)*rlco;
			u8 *p = dst;

			/*
			 * The raw samples are packed in the stream, but the
			 * destination samples are dst_step bytes apart, just
			 * as in the compressed path below. A plain memcpy()
			 * would only be right for dst_step == 1.
			 */
			for (i = 0; i < width; i++, p += dst_step)
				*p = src[i];
			dst += dst_stride;
			*rlco += width / 2;
		}
		return true;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;

			if (copies) {
				/* Repeat of the previous block. */
				memcpy(cf->de_fwht, copy, sizeof(copy));
				copies--;
			} else {
				u16 stat = derlc(rlco, cf->coeffs,
						 end_of_rlco_buf);

				if (stat & OVERFLOW_BIT)
					return false;

				is_pblock = (stat & PFRAME_BIT) && !is_intra;
				if (is_pblock)
					dequantize_inter(cf->coeffs);
				else
					dequantize_intra(cf->coeffs);

				ifwht(cf->coeffs, cf->de_fwht,
				      is_pblock ? PBLOCK : IBLOCK);

				/*
				 * Keep the block before the reference is
				 * added: each copy adds its own reference.
				 */
				copies = (stat & DUPS_MASK) >> 1;
				if (copies)
					memcpy(copy, cf->de_fwht, sizeof(copy));
			}

			/*
			 * The reference position is only computed for inter
			 * blocks, so a NULL ref is never used in arithmetic.
			 */
			if (is_pblock)
				add_deltas(cf->de_fwht,
					   ref + j * 8 * ref_stride +
					   i * 8 * ref_step,
					   ref_stride, ref_step);
			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
					   dst_step);
		}
	}
	return true;
}
909dacca5f0SHans Verkuil
/*
 * Decode a compressed frame from @cf->rlc_data (@cf->size bytes) into @dst.
 *
 * The luma plane is always decoded. Cb and Cr follow when @components_num
 * is at least 3; their height and width are halved unless
 * V4L2_FWHT_FL_CHROMA_FULL_HEIGHT or V4L2_FWHT_FL_CHROMA_FULL_WIDTH is set
 * in @hdr_flags. An alpha plane with the luma geometry follows when
 * @components_num is 4. The V4L2_FWHT_FL_*_IS_UNCOMPRESSED bits of
 * @hdr_flags select raw decoding for the corresponding plane.
 *
 * @ref supplies the reference planes and their sample steps, @dst the
 * destination planes and their sample steps.
 *
 * Returns false as soon as decode_plane() fails for a plane, true if all
 * planes were decoded.
 */
bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
		       unsigned int components_num, unsigned int width,
		       unsigned int height, const struct fwht_raw_frame *ref,
		       unsigned int ref_stride, unsigned int ref_chroma_stride,
		       struct fwht_raw_frame *dst, unsigned int dst_stride,
		       unsigned int dst_chroma_stride)
{
	const __be16 *rlco = cf->rlc_data;
	/* Last 16-bit word of the compressed data. */
	const __be16 *rlco_last = cf->rlc_data +
				  (cf->size / sizeof(*rlco)) - 1;

	/* Luma */
	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
			  ref->luma_alpha_step, dst->luma, dst_stride,
			  dst->luma_alpha_step,
			  hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
			  rlco_last))
		return false;

	if (components_num >= 3) {
		const u32 chroma_h =
			(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT) ?
			height : height / 2;
		const u32 chroma_w =
			(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH) ?
			width : width / 2;

		/* Cb */
		if (!decode_plane(cf, &rlco, chroma_h, chroma_w, ref->cb,
				  ref_chroma_stride, ref->chroma_step,
				  dst->cb, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
				  rlco_last))
			return false;

		/* Cr */
		if (!decode_plane(cf, &rlco, chroma_h, chroma_w, ref->cr,
				  ref_chroma_stride, ref->chroma_step,
				  dst->cr, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
				  rlco_last))
			return false;
	}

	/* Alpha */
	if (components_num == 4 &&
	    !decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
			  ref->luma_alpha_step, dst->alpha, dst_stride,
			  dst->luma_alpha_step,
			  hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
			  rlco_last))
		return false;

	return true;
}
960