xref: /openbmc/linux/drivers/media/test-drivers/vicodec/codec-fwht.c (revision 3abfc314c5e60a54973a6f3cefd591bfdad8adf6)
// SPDX-License-Identifier: LGPL-2.1+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */
11dacca5f0SHans Verkuil 
12dacca5f0SHans Verkuil #include <linux/string.h>
13dacca5f0SHans Verkuil #include <linux/kernel.h>
14dacca5f0SHans Verkuil #include "codec-fwht.h"
15dacca5f0SHans Verkuil 
/* Status value returned by derlc() when the input ends prematurely. */
#define OVERFLOW_BIT	BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */

/* Header flag that rlc() sets when it encodes a PBLOCK. */
#define PFRAME_BIT	BIT(15)
/*
 * Mask covering header bits 1..12 (bit 0 stays clear, see the note
 * above). NOTE(review): its users are not in this part of the file.
 */
#define DUPS_MASK	0x1ffe

/* Block types as returned by decide_blocktype() and tested by rlc(). */
#define PBLOCK		0
#define IBLOCK		1

/* Run-length value meaning "the remainder of the block is zero". */
#define ALL_ZEROS	15
30dacca5f0SHans Verkuil 
/*
 * Scan order used by the run-length coder: entry i is the raster index
 * (x + y * 8) of the i-th coefficient in the scan.
 *
 * The scan walks the anti-diagonals of the 8x8 block one after the
 * other. Every diagonal is traversed in the same direction, starting
 * at its top-right end and moving down-left, so each row of the
 * initializer below is one diagonal.
 */
static const uint8_t zigzag[64] = {
	 0,
	 1,  8,
	 2,  9, 16,
	 3, 10, 17, 24,
	 4, 11, 18, 25, 32,
	 5, 12, 19, 26, 33, 40,
	 6, 13, 20, 27, 34, 41, 48,
	 7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
48dacca5f0SHans Verkuil 
49dacca5f0SHans Verkuil /*
50dacca5f0SHans Verkuil  * noinline_for_stack to work around
51dacca5f0SHans Verkuil  * https://bugs.llvm.org/show_bug.cgi?id=38809
52dacca5f0SHans Verkuil  */
53dacca5f0SHans Verkuil static int noinline_for_stack
54dacca5f0SHans Verkuil rlc(const s16 *in, __be16 *output, int blocktype)
55dacca5f0SHans Verkuil {
56dacca5f0SHans Verkuil 	s16 block[8 * 8];
57dacca5f0SHans Verkuil 	s16 *wp = block;
58dacca5f0SHans Verkuil 	int i = 0;
59dacca5f0SHans Verkuil 	int x, y;
60dacca5f0SHans Verkuil 	int ret = 0;
61dacca5f0SHans Verkuil 
62dacca5f0SHans Verkuil 	/* read in block from framebuffer */
63dacca5f0SHans Verkuil 	int lastzero_run = 0;
64dacca5f0SHans Verkuil 	int to_encode;
65dacca5f0SHans Verkuil 
66dacca5f0SHans Verkuil 	for (y = 0; y < 8; y++) {
67dacca5f0SHans Verkuil 		for (x = 0; x < 8; x++) {
68dacca5f0SHans Verkuil 			*wp = in[x + y * 8];
69dacca5f0SHans Verkuil 			wp++;
70dacca5f0SHans Verkuil 		}
71dacca5f0SHans Verkuil 	}
72dacca5f0SHans Verkuil 
73dacca5f0SHans Verkuil 	/* keep track of amount of trailing zeros */
74dacca5f0SHans Verkuil 	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
75dacca5f0SHans Verkuil 		lastzero_run++;
76dacca5f0SHans Verkuil 
77dacca5f0SHans Verkuil 	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
78dacca5f0SHans Verkuil 	ret++;
79dacca5f0SHans Verkuil 
80dacca5f0SHans Verkuil 	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
81dacca5f0SHans Verkuil 
82dacca5f0SHans Verkuil 	i = 0;
83dacca5f0SHans Verkuil 	while (i < to_encode) {
84dacca5f0SHans Verkuil 		int cnt = 0;
85dacca5f0SHans Verkuil 		int tmp;
86dacca5f0SHans Verkuil 
87dacca5f0SHans Verkuil 		/* count leading zeros */
88dacca5f0SHans Verkuil 		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
89dacca5f0SHans Verkuil 			cnt++;
90dacca5f0SHans Verkuil 			i++;
91dacca5f0SHans Verkuil 			if (i == to_encode) {
92dacca5f0SHans Verkuil 				cnt--;
93dacca5f0SHans Verkuil 				break;
94dacca5f0SHans Verkuil 			}
95dacca5f0SHans Verkuil 		}
96dacca5f0SHans Verkuil 		/* 4 bits for run, 12 for coefficient (quantization by 4) */
97dacca5f0SHans Verkuil 		*output++ = htons((cnt | tmp << 4));
98dacca5f0SHans Verkuil 		i++;
99dacca5f0SHans Verkuil 		ret++;
100dacca5f0SHans Verkuil 	}
101dacca5f0SHans Verkuil 	if (lastzero_run > 14) {
102dacca5f0SHans Verkuil 		*output = htons(ALL_ZEROS | 0);
103dacca5f0SHans Verkuil 		ret++;
104dacca5f0SHans Verkuil 	}
105dacca5f0SHans Verkuil 
106dacca5f0SHans Verkuil 	return ret;
107dacca5f0SHans Verkuil }
108dacca5f0SHans Verkuil 
109dacca5f0SHans Verkuil /*
110dacca5f0SHans Verkuil  * This function will worst-case increase rlc_in by 65*2 bytes:
111dacca5f0SHans Verkuil  * one s16 value for the header and 8 * 8 coefficients of type s16.
112dacca5f0SHans Verkuil  */
113dacca5f0SHans Verkuil static noinline_for_stack u16
114dacca5f0SHans Verkuil derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
115dacca5f0SHans Verkuil {
116dacca5f0SHans Verkuil 	/* header */
117dacca5f0SHans Verkuil 	const __be16 *input = *rlc_in;
118dacca5f0SHans Verkuil 	u16 stat;
119dacca5f0SHans Verkuil 	int dec_count = 0;
120dacca5f0SHans Verkuil 	s16 block[8 * 8 + 16];
121dacca5f0SHans Verkuil 	s16 *wp = block;
122dacca5f0SHans Verkuil 	int i;
123dacca5f0SHans Verkuil 
124dacca5f0SHans Verkuil 	if (input > end_of_input)
125dacca5f0SHans Verkuil 		return OVERFLOW_BIT;
126dacca5f0SHans Verkuil 	stat = ntohs(*input++);
127dacca5f0SHans Verkuil 
128dacca5f0SHans Verkuil 	/*
129dacca5f0SHans Verkuil 	 * Now de-compress, it expands one byte to up to 15 bytes
130dacca5f0SHans Verkuil 	 * (or fills the remainder of the 64 bytes with zeroes if it
131dacca5f0SHans Verkuil 	 * is the last byte to expand).
132dacca5f0SHans Verkuil 	 *
133dacca5f0SHans Verkuil 	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
134dacca5f0SHans Verkuil 	 * allow for overflow if the incoming data was malformed.
135dacca5f0SHans Verkuil 	 */
136dacca5f0SHans Verkuil 	while (dec_count < 8 * 8) {
137dacca5f0SHans Verkuil 		s16 in;
138dacca5f0SHans Verkuil 		int length;
139dacca5f0SHans Verkuil 		int coeff;
140dacca5f0SHans Verkuil 
141dacca5f0SHans Verkuil 		if (input > end_of_input)
142dacca5f0SHans Verkuil 			return OVERFLOW_BIT;
143dacca5f0SHans Verkuil 		in = ntohs(*input++);
144dacca5f0SHans Verkuil 		length = in & 0xf;
145dacca5f0SHans Verkuil 		coeff = in >> 4;
146dacca5f0SHans Verkuil 
147dacca5f0SHans Verkuil 		/* fill remainder with zeros */
148dacca5f0SHans Verkuil 		if (length == 15) {
149dacca5f0SHans Verkuil 			for (i = 0; i < 64 - dec_count; i++)
150dacca5f0SHans Verkuil 				*wp++ = 0;
151dacca5f0SHans Verkuil 			break;
152dacca5f0SHans Verkuil 		}
153dacca5f0SHans Verkuil 
154dacca5f0SHans Verkuil 		for (i = 0; i < length; i++)
155dacca5f0SHans Verkuil 			*wp++ = 0;
156dacca5f0SHans Verkuil 		*wp++ = coeff;
157dacca5f0SHans Verkuil 		dec_count += length + 1;
158dacca5f0SHans Verkuil 	}
159dacca5f0SHans Verkuil 
160dacca5f0SHans Verkuil 	wp = block;
161dacca5f0SHans Verkuil 
162dacca5f0SHans Verkuil 	for (i = 0; i < 64; i++) {
163dacca5f0SHans Verkuil 		int pos = zigzag[i];
164dacca5f0SHans Verkuil 		int y = pos / 8;
165dacca5f0SHans Verkuil 		int x = pos % 8;
166dacca5f0SHans Verkuil 
167dacca5f0SHans Verkuil 		dwht_out[x + y * 8] = *wp++;
168dacca5f0SHans Verkuil 	}
169dacca5f0SHans Verkuil 	*rlc_in = input;
170dacca5f0SHans Verkuil 	return stat;
171dacca5f0SHans Verkuil }
172dacca5f0SHans Verkuil 
173dacca5f0SHans Verkuil static const int quant_table[] = {
174dacca5f0SHans Verkuil 	2, 2, 2, 2, 2, 2,  2,  2,
175dacca5f0SHans Verkuil 	2, 2, 2, 2, 2, 2,  2,  2,
176dacca5f0SHans Verkuil 	2, 2, 2, 2, 2, 2,  2,  3,
177dacca5f0SHans Verkuil 	2, 2, 2, 2, 2, 2,  3,  6,
178dacca5f0SHans Verkuil 	2, 2, 2, 2, 2, 3,  6,  6,
179dacca5f0SHans Verkuil 	2, 2, 2, 2, 3, 6,  6,  6,
180dacca5f0SHans Verkuil 	2, 2, 2, 3, 6, 6,  6,  6,
181dacca5f0SHans Verkuil 	2, 2, 3, 6, 6, 6,  6,  8,
182dacca5f0SHans Verkuil };
183dacca5f0SHans Verkuil 
184dacca5f0SHans Verkuil static const int quant_table_p[] = {
185dacca5f0SHans Verkuil 	3, 3, 3, 3, 3, 3,  3,  3,
186dacca5f0SHans Verkuil 	3, 3, 3, 3, 3, 3,  3,  3,
187dacca5f0SHans Verkuil 	3, 3, 3, 3, 3, 3,  3,  3,
188dacca5f0SHans Verkuil 	3, 3, 3, 3, 3, 3,  3,  6,
189dacca5f0SHans Verkuil 	3, 3, 3, 3, 3, 3,  6,  6,
190dacca5f0SHans Verkuil 	3, 3, 3, 3, 3, 6,  6,  9,
191dacca5f0SHans Verkuil 	3, 3, 3, 3, 6, 6,  9,  9,
192dacca5f0SHans Verkuil 	3, 3, 3, 6, 6, 9,  9,  10,
193dacca5f0SHans Verkuil };
194dacca5f0SHans Verkuil 
195dacca5f0SHans Verkuil static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
196dacca5f0SHans Verkuil {
197dacca5f0SHans Verkuil 	const int *quant = quant_table;
198dacca5f0SHans Verkuil 	int i, j;
199dacca5f0SHans Verkuil 
200dacca5f0SHans Verkuil 	for (j = 0; j < 8; j++) {
201dacca5f0SHans Verkuil 		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
202dacca5f0SHans Verkuil 			*coeff >>= *quant;
203dacca5f0SHans Verkuil 			if (*coeff >= -qp && *coeff <= qp)
204dacca5f0SHans Verkuil 				*coeff = *de_coeff = 0;
205dacca5f0SHans Verkuil 			else
206dacca5f0SHans Verkuil 				*de_coeff = *coeff << *quant;
207dacca5f0SHans Verkuil 		}
208dacca5f0SHans Verkuil 	}
209dacca5f0SHans Verkuil }
210dacca5f0SHans Verkuil 
211dacca5f0SHans Verkuil static void dequantize_intra(s16 *coeff)
212dacca5f0SHans Verkuil {
213dacca5f0SHans Verkuil 	const int *quant = quant_table;
214dacca5f0SHans Verkuil 	int i, j;
215dacca5f0SHans Verkuil 
216dacca5f0SHans Verkuil 	for (j = 0; j < 8; j++)
217dacca5f0SHans Verkuil 		for (i = 0; i < 8; i++, quant++, coeff++)
218dacca5f0SHans Verkuil 			*coeff <<= *quant;
219dacca5f0SHans Verkuil }
220dacca5f0SHans Verkuil 
221dacca5f0SHans Verkuil static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
222dacca5f0SHans Verkuil {
223dacca5f0SHans Verkuil 	const int *quant = quant_table_p;
224dacca5f0SHans Verkuil 	int i, j;
225dacca5f0SHans Verkuil 
226dacca5f0SHans Verkuil 	for (j = 0; j < 8; j++) {
227dacca5f0SHans Verkuil 		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
228dacca5f0SHans Verkuil 			*coeff >>= *quant;
229dacca5f0SHans Verkuil 			if (*coeff >= -qp && *coeff <= qp)
230dacca5f0SHans Verkuil 				*coeff = *de_coeff = 0;
231dacca5f0SHans Verkuil 			else
232dacca5f0SHans Verkuil 				*de_coeff = *coeff << *quant;
233dacca5f0SHans Verkuil 		}
234dacca5f0SHans Verkuil 	}
235dacca5f0SHans Verkuil }
236dacca5f0SHans Verkuil 
237dacca5f0SHans Verkuil static void dequantize_inter(s16 *coeff)
238dacca5f0SHans Verkuil {
239dacca5f0SHans Verkuil 	const int *quant = quant_table_p;
240dacca5f0SHans Verkuil 	int i, j;
241dacca5f0SHans Verkuil 
242dacca5f0SHans Verkuil 	for (j = 0; j < 8; j++)
243dacca5f0SHans Verkuil 		for (i = 0; i < 8; i++, quant++, coeff++)
244dacca5f0SHans Verkuil 			*coeff <<= *quant;
245dacca5f0SHans Verkuil }
246dacca5f0SHans Verkuil 
/*
 * Forward 8x8 Walsh Hadamard transform of 8-bit samples.
 *
 * block:        top-left sample of the 8x8 input block
 * output_block: receives the 64 coefficients in raster order
 * stride:       distance in bytes between two input rows
 * input_step:   distance in bytes between two samples of a row;
 *               1, 2 and 3 are used as given, any other value is
 *               treated as 4
 * intra:        if true, 256 is subtracted from the sum of every
 *               sample pair in the row pass (128 per sample)
 *
 * The rows are transformed first, then the columns of that result.
 */
static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
				    unsigned int stride,
				    unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 s1[8], s2[8];
	const u8 *row = block;
	s16 *out = output_block;
	const int pair_bias = intra ? 256 : 0;
	unsigned int step;
	unsigned int i, k;

	if (input_step == 1 || input_step == 2 || input_step == 3)
		step = input_step;
	else
		step = 4;

	/* row pass */
	for (i = 0; i < 8; i++, row += stride, out += 8) {
		/* stage 1 */
		for (k = 0; k < 8; k += 2) {
			const int a = row[k * step];
			const int b = row[(k + 1) * step];

			s1[k] = a + b - pair_bias;
			s1[k + 1] = a - b;
		}

		/* stage 2 */
		for (k = 0; k < 8; k += 4) {
			s2[k] = s1[k] + s1[k + 2];
			s2[k + 1] = s1[k] - s1[k + 2];
			s2[k + 2] = s1[k + 1] - s1[k + 3];
			s2[k + 3] = s1[k + 1] + s1[k + 3];
		}

		/* stage 3 */
		out[0] = s2[0] + s2[4];
		out[1] = s2[0] - s2[4];
		out[2] = s2[1] - s2[5];
		out[3] = s2[1] + s2[5];
		out[4] = s2[2] + s2[6];
		out[5] = s2[2] - s2[6];
		out[6] = s2[3] - s2[7];
		out[7] = s2[3] + s2[7];
	}

	/* column pass, in place on the row-transformed data */
	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		for (k = 0; k < 8; k += 2) {
			const int a = out[k * 8];
			const int b = out[(k + 1) * 8];

			s1[k] = a + b;
			s1[k + 1] = a - b;
		}

		/* stage 2 */
		for (k = 0; k < 8; k += 4) {
			s2[k] = s1[k] + s1[k + 2];
			s2[k + 1] = s1[k] - s1[k + 2];
			s2[k + 2] = s1[k + 1] - s1[k + 3];
			s2[k + 3] = s1[k + 1] + s1[k + 3];
		}

		/* stage 3 */
		out[0 * 8] = s2[0] + s2[4];
		out[1 * 8] = s2[0] - s2[4];
		out[2 * 8] = s2[1] - s2[5];
		out[3 * 8] = s2[1] + s2[5];
		out[4 * 8] = s2[2] + s2[6];
		out[5 * 8] = s2[2] - s2[6];
		out[6 * 8] = s2[3] - s2[7];
		out[7 * 8] = s2[3] + s2[7];
	}
}
374dacca5f0SHans Verkuil 
/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore values can be negative... This is just a version of the
 * forward transform that works on signed 16-bit data.
 *
 * block:        top-left value of the 8x8 input block
 * output_block: receives the 64 coefficients in raster order
 * stride:       distance, in s16 elements, between two input rows
 * intra:        not used by this function
 */
static void noinline_for_stack
fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 s1[8], s2[8];
	const s16 *row = block;
	s16 *out = output_block;
	int i, k;

	/* row pass */
	for (i = 0; i < 8; i++, row += stride, out += 8) {
		/* stage 1 */
		for (k = 0; k < 8; k += 2) {
			s1[k] = row[k] + row[k + 1];
			s1[k + 1] = row[k] - row[k + 1];
		}

		/* stage 2 */
		for (k = 0; k < 8; k += 4) {
			s2[k] = s1[k] + s1[k + 2];
			s2[k + 1] = s1[k] - s1[k + 2];
			s2[k + 2] = s1[k + 1] - s1[k + 3];
			s2[k + 3] = s1[k + 1] + s1[k + 3];
		}

		/* stage 3 */
		out[0] = s2[0] + s2[4];
		out[1] = s2[0] - s2[4];
		out[2] = s2[1] - s2[5];
		out[3] = s2[1] + s2[5];
		out[4] = s2[2] + s2[6];
		out[5] = s2[2] - s2[6];
		out[6] = s2[3] - s2[7];
		out[7] = s2[3] + s2[7];
	}

	/* column pass, in place on the row-transformed data */
	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		for (k = 0; k < 8; k += 2) {
			s1[k] = out[k * 8] + out[(k + 1) * 8];
			s1[k + 1] = out[k * 8] - out[(k + 1) * 8];
		}

		/* stage 2 */
		for (k = 0; k < 8; k += 4) {
			s2[k] = s1[k] + s1[k + 2];
			s2[k + 1] = s1[k] - s1[k + 2];
			s2[k + 2] = s1[k + 1] - s1[k + 3];
			s2[k + 3] = s1[k + 1] + s1[k + 3];
		}

		/* stage 3 */
		out[0 * 8] = s2[0] + s2[4];
		out[1 * 8] = s2[0] - s2[4];
		out[2 * 8] = s2[1] - s2[5];
		out[3 * 8] = s2[1] + s2[5];
		out[4 * 8] = s2[2] + s2[6];
		out[5 * 8] = s2[2] - s2[6];
		out[6 * 8] = s2[3] - s2[7];
		out[7 * 8] = s2[3] + s2[7];
	}
}
464dacca5f0SHans Verkuil 
/*
 * Inverse 8x8 Walsh Hadamard transform.
 *
 * block:        64 coefficients in raster order
 * output_block: receives the 64 reconstructed values in raster order
 * intra:        non-zero for an intra block, in which case 128 is
 *               added to every output value (the counterpart of the
 *               256 per sample pair that fwht() subtracts)
 *
 * The rows are transformed first, then the columns of that result.
 * Every value is shifted right by 6 after the column pass.
 */
static noinline_for_stack void
ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients
	 * use native unit of cpu
	 */
	int s1[8], s2[8];
	const s16 *row = block;
	s16 *out = output_block;
	int i, k;

	/* row pass */
	for (i = 0; i < 8; i++, row += 8, out += 8) {
		/* stage 1 */
		for (k = 0; k < 8; k += 2) {
			s1[k] = row[k] + row[k + 1];
			s1[k + 1] = row[k] - row[k + 1];
		}

		/* stage 2 */
		for (k = 0; k < 8; k += 4) {
			s2[k] = s1[k] + s1[k + 2];
			s2[k + 1] = s1[k] - s1[k + 2];
			s2[k + 2] = s1[k + 1] - s1[k + 3];
			s2[k + 3] = s1[k + 1] + s1[k + 3];
		}

		/* stage 3 */
		out[0] = s2[0] + s2[4];
		out[1] = s2[0] - s2[4];
		out[2] = s2[1] - s2[5];
		out[3] = s2[1] + s2[5];
		out[4] = s2[2] + s2[6];
		out[5] = s2[2] - s2[6];
		out[6] = s2[3] - s2[7];
		out[7] = s2[3] + s2[7];
	}

	/* column pass, in place on the row-transformed data */
	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		for (k = 0; k < 8; k += 2) {
			s1[k] = out[k * 8] + out[(k + 1) * 8];
			s1[k + 1] = out[k * 8] - out[(k + 1) * 8];
		}

		/* stage 2 */
		for (k = 0; k < 8; k += 4) {
			s2[k] = s1[k] + s1[k + 2];
			s2[k + 1] = s1[k] - s1[k + 2];
			s2[k + 2] = s1[k + 1] - s1[k + 3];
			s2[k + 3] = s1[k + 1] + s1[k + 3];
		}

		/* stage 3 */
		out[0 * 8] = s2[0] + s2[4];
		out[1 * 8] = s2[0] - s2[4];
		out[2 * 8] = s2[1] - s2[5];
		out[3 * 8] = s2[1] + s2[5];
		out[4 * 8] = s2[2] + s2[6];
		out[5 * 8] = s2[2] - s2[6];
		out[6 * 8] = s2[3] - s2[7];
		out[7 * 8] = s2[3] + s2[7];

		/* scale down; intra blocks also get their offset back */
		for (k = 0; k < 8; k++) {
			out[8 * k] >>= 6;
			if (intra)
				out[8 * k] += 128;
		}
	}
}
575dacca5f0SHans Verkuil 
/*
 * Gather an 8x8 block of 8-bit samples into a contiguous s16 array.
 *
 * input:      top-left sample of the block
 * dst:        receives the 64 samples in raster order
 * stride:     distance in bytes between two input rows
 * input_step: distance in bytes between two samples of a row
 */
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	/* what is left of a row after its 8 samples have been read */
	const unsigned int row_gap = stride - 8 * input_step;
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			*dst++ = *input;
			input += input_step;
		}
		input += row_gap;
	}
}
587dacca5f0SHans Verkuil 
/*
 * Spread of a block around its own mean: the sum over all 64 values
 * of the absolute difference to the (integer, truncated) mean.
 */
static int var_intra(const s16 *input)
{
	int32_t sum = 0;
	int32_t spread = 0;
	int32_t mean;
	int n;

	for (n = 0; n < 8 * 8; n++)
		sum += input[n];
	mean = sum / 64;

	for (n = 0; n < 8 * 8; n++) {
		const int32_t diff = input[n] - mean;

		spread += diff < 0 ? -diff : diff;
	}
	return spread;
}
603dacca5f0SHans Verkuil 
/*
 * Difference between two blocks: the sum over all 64 positions of the
 * absolute difference between the values of old and new.
 */
static int var_inter(const s16 *old, const s16 *new)
{
	int32_t total = 0;
	int n;

	for (n = 0; n < 8 * 8; n++) {
		const int diff = old[n] - new[n];

		total += diff < 0 ? -diff : diff;
	}
	return total;
}
613dacca5f0SHans Verkuil 
614dacca5f0SHans Verkuil static noinline_for_stack int
615dacca5f0SHans Verkuil decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
616dacca5f0SHans Verkuil 		 unsigned int stride, unsigned int input_step)
617dacca5f0SHans Verkuil {
618dacca5f0SHans Verkuil 	s16 tmp[64];
619dacca5f0SHans Verkuil 	s16 old[64];
620dacca5f0SHans Verkuil 	s16 *work = tmp;
621dacca5f0SHans Verkuil 	unsigned int k, l;
622dacca5f0SHans Verkuil 	int vari;
623dacca5f0SHans Verkuil 	int vard;
624dacca5f0SHans Verkuil 
625dacca5f0SHans Verkuil 	fill_encoder_block(cur, tmp, stride, input_step);
626dacca5f0SHans Verkuil 	fill_encoder_block(reference, old, 8, 1);
627dacca5f0SHans Verkuil 	vari = var_intra(tmp);
628dacca5f0SHans Verkuil 
629dacca5f0SHans Verkuil 	for (k = 0; k < 8; k++) {
630dacca5f0SHans Verkuil 		for (l = 0; l < 8; l++) {
631dacca5f0SHans Verkuil 			*deltablock = *work - *reference;
632dacca5f0SHans Verkuil 			deltablock++;
633dacca5f0SHans Verkuil 			work++;
634dacca5f0SHans Verkuil 			reference++;
635dacca5f0SHans Verkuil 		}
636dacca5f0SHans Verkuil 	}
637dacca5f0SHans Verkuil 	deltablock -= 64;
638dacca5f0SHans Verkuil 	vard = var_inter(old, tmp);
639dacca5f0SHans Verkuil 	return vari <= vard ? IBLOCK : PBLOCK;
640dacca5f0SHans Verkuil }
641dacca5f0SHans Verkuil 
642dacca5f0SHans Verkuil static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
643dacca5f0SHans Verkuil 			       unsigned int dst_step)
644dacca5f0SHans Verkuil {
645dacca5f0SHans Verkuil 	int i, j;
646dacca5f0SHans Verkuil 
647dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++) {
648dacca5f0SHans Verkuil 		for (j = 0; j < 8; j++, input++, dst += dst_step) {
649dacca5f0SHans Verkuil 			if (*input < 0)
650dacca5f0SHans Verkuil 				*dst = 0;
651dacca5f0SHans Verkuil 			else if (*input > 255)
652dacca5f0SHans Verkuil 				*dst = 255;
653dacca5f0SHans Verkuil 			else
654dacca5f0SHans Verkuil 				*dst = *input;
655dacca5f0SHans Verkuil 		}
656dacca5f0SHans Verkuil 		dst += stride - (8 * dst_step);
657dacca5f0SHans Verkuil 	}
658dacca5f0SHans Verkuil }
659dacca5f0SHans Verkuil 
660dacca5f0SHans Verkuil static void add_deltas(s16 *deltas, const u8 *ref, int stride,
661dacca5f0SHans Verkuil 		       unsigned int ref_step)
662dacca5f0SHans Verkuil {
663dacca5f0SHans Verkuil 	int k, l;
664dacca5f0SHans Verkuil 
665dacca5f0SHans Verkuil 	for (k = 0; k < 8; k++) {
666dacca5f0SHans Verkuil 		for (l = 0; l < 8; l++) {
667dacca5f0SHans Verkuil 			*deltas += *ref;
668dacca5f0SHans Verkuil 			ref += ref_step;
669dacca5f0SHans Verkuil 			/*
670dacca5f0SHans Verkuil 			 * Due to quantizing, it might possible that the
671dacca5f0SHans Verkuil 			 * decoded coefficients are slightly out of range
672dacca5f0SHans Verkuil 			 */
673dacca5f0SHans Verkuil 			if (*deltas < 0)
674dacca5f0SHans Verkuil 				*deltas = 0;
675dacca5f0SHans Verkuil 			else if (*deltas > 255)
676dacca5f0SHans Verkuil 				*deltas = 255;
677dacca5f0SHans Verkuil 			deltas++;
678dacca5f0SHans Verkuil 		}
679dacca5f0SHans Verkuil 		ref += stride - (8 * ref_step);
680dacca5f0SHans Verkuil 	}
681dacca5f0SHans Verkuil }
682dacca5f0SHans Verkuil 
683dacca5f0SHans Verkuil static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
684dacca5f0SHans Verkuil 			struct fwht_cframe *cf, u32 height, u32 width,
685dacca5f0SHans Verkuil 			u32 stride, unsigned int input_step,
686dacca5f0SHans Verkuil 			bool is_intra, bool next_is_intra)
687dacca5f0SHans Verkuil {
688dacca5f0SHans Verkuil 	u8 *input_start = input;
689dacca5f0SHans Verkuil 	__be16 *rlco_start = *rlco;
690dacca5f0SHans Verkuil 	s16 deltablock[64];
691dacca5f0SHans Verkuil 	__be16 pframe_bit = htons(PFRAME_BIT);
692dacca5f0SHans Verkuil 	u32 encoding = 0;
693dacca5f0SHans Verkuil 	unsigned int last_size = 0;
694dacca5f0SHans Verkuil 	unsigned int i, j;
695dacca5f0SHans Verkuil 
696dacca5f0SHans Verkuil 	width = round_up(width, 8);
697dacca5f0SHans Verkuil 	height = round_up(height, 8);
698dacca5f0SHans Verkuil 
699dacca5f0SHans Verkuil 	for (j = 0; j < height / 8; j++) {
700dacca5f0SHans Verkuil 		input = input_start + j * 8 * stride;
701dacca5f0SHans Verkuil 		for (i = 0; i < width / 8; i++) {
702dacca5f0SHans Verkuil 			/* intra code, first frame is always intra coded. */
703dacca5f0SHans Verkuil 			int blocktype = IBLOCK;
704dacca5f0SHans Verkuil 			unsigned int size;
705dacca5f0SHans Verkuil 
706dacca5f0SHans Verkuil 			if (!is_intra)
707dacca5f0SHans Verkuil 				blocktype = decide_blocktype(input, refp,
708dacca5f0SHans Verkuil 					deltablock, stride, input_step);
709dacca5f0SHans Verkuil 			if (blocktype == IBLOCK) {
710dacca5f0SHans Verkuil 				fwht(input, cf->coeffs, stride, input_step, 1);
711dacca5f0SHans Verkuil 				quantize_intra(cf->coeffs, cf->de_coeffs,
712dacca5f0SHans Verkuil 					       cf->i_frame_qp);
713dacca5f0SHans Verkuil 			} else {
714dacca5f0SHans Verkuil 				/* inter code */
715dacca5f0SHans Verkuil 				encoding |= FWHT_FRAME_PCODED;
716dacca5f0SHans Verkuil 				fwht16(deltablock, cf->coeffs, 8, 0);
717dacca5f0SHans Verkuil 				quantize_inter(cf->coeffs, cf->de_coeffs,
718dacca5f0SHans Verkuil 					       cf->p_frame_qp);
719dacca5f0SHans Verkuil 			}
720dacca5f0SHans Verkuil 			if (!next_is_intra) {
721dacca5f0SHans Verkuil 				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
722dacca5f0SHans Verkuil 
723dacca5f0SHans Verkuil 				if (blocktype == PBLOCK)
724dacca5f0SHans Verkuil 					add_deltas(cf->de_fwht, refp, 8, 1);
725dacca5f0SHans Verkuil 				fill_decoder_block(refp, cf->de_fwht, 8, 1);
726dacca5f0SHans Verkuil 			}
727dacca5f0SHans Verkuil 
728dacca5f0SHans Verkuil 			input += 8 * input_step;
729dacca5f0SHans Verkuil 			refp += 8 * 8;
730dacca5f0SHans Verkuil 
731dacca5f0SHans Verkuil 			size = rlc(cf->coeffs, *rlco, blocktype);
732dacca5f0SHans Verkuil 			if (last_size == size &&
733dacca5f0SHans Verkuil 			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
734dacca5f0SHans Verkuil 				__be16 *last_rlco = *rlco - size;
735dacca5f0SHans Verkuil 				s16 hdr = ntohs(*last_rlco);
736dacca5f0SHans Verkuil 
737dacca5f0SHans Verkuil 				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
738dacca5f0SHans Verkuil 				    (hdr & DUPS_MASK) < DUPS_MASK)
739dacca5f0SHans Verkuil 					*last_rlco = htons(hdr + 2);
740dacca5f0SHans Verkuil 				else
741dacca5f0SHans Verkuil 					*rlco += size;
742dacca5f0SHans Verkuil 			} else {
743dacca5f0SHans Verkuil 				*rlco += size;
744dacca5f0SHans Verkuil 			}
745dacca5f0SHans Verkuil 			if (*rlco >= rlco_max) {
746dacca5f0SHans Verkuil 				encoding |= FWHT_FRAME_UNENCODED;
747dacca5f0SHans Verkuil 				goto exit_loop;
748dacca5f0SHans Verkuil 			}
749dacca5f0SHans Verkuil 			last_size = size;
750dacca5f0SHans Verkuil 		}
751dacca5f0SHans Verkuil 	}
752dacca5f0SHans Verkuil 
753dacca5f0SHans Verkuil exit_loop:
754dacca5f0SHans Verkuil 	if (encoding & FWHT_FRAME_UNENCODED) {
755dacca5f0SHans Verkuil 		u8 *out = (u8 *)rlco_start;
756dacca5f0SHans Verkuil 		u8 *p;
757dacca5f0SHans Verkuil 
758dacca5f0SHans Verkuil 		input = input_start;
759dacca5f0SHans Verkuil 		/*
760dacca5f0SHans Verkuil 		 * The compressed stream should never contain the magic
761dacca5f0SHans Verkuil 		 * header, so when we copy the YUV data we replace 0xff
762dacca5f0SHans Verkuil 		 * by 0xfe. Since YUV is limited range such values
763dacca5f0SHans Verkuil 		 * shouldn't appear anyway.
764dacca5f0SHans Verkuil 		 */
765dacca5f0SHans Verkuil 		for (j = 0; j < height; j++) {
766dacca5f0SHans Verkuil 			for (i = 0, p = input; i < width; i++, p += input_step)
767dacca5f0SHans Verkuil 				*out++ = (*p == 0xff) ? 0xfe : *p;
768dacca5f0SHans Verkuil 			input += stride;
769dacca5f0SHans Verkuil 		}
770dacca5f0SHans Verkuil 		*rlco = (__be16 *)out;
771dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_PCODED;
772dacca5f0SHans Verkuil 	}
773dacca5f0SHans Verkuil 	return encoding;
774dacca5f0SHans Verkuil }
775dacca5f0SHans Verkuil 
776dacca5f0SHans Verkuil u32 fwht_encode_frame(struct fwht_raw_frame *frm,
777dacca5f0SHans Verkuil 		      struct fwht_raw_frame *ref_frm,
778dacca5f0SHans Verkuil 		      struct fwht_cframe *cf,
779dacca5f0SHans Verkuil 		      bool is_intra, bool next_is_intra,
780dacca5f0SHans Verkuil 		      unsigned int width, unsigned int height,
781dacca5f0SHans Verkuil 		      unsigned int stride, unsigned int chroma_stride)
782dacca5f0SHans Verkuil {
783dacca5f0SHans Verkuil 	unsigned int size = height * width;
784dacca5f0SHans Verkuil 	__be16 *rlco = cf->rlc_data;
785dacca5f0SHans Verkuil 	__be16 *rlco_max;
786dacca5f0SHans Verkuil 	u32 encoding;
787dacca5f0SHans Verkuil 
788dacca5f0SHans Verkuil 	rlco_max = rlco + size / 2 - 256;
789dacca5f0SHans Verkuil 	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
790dacca5f0SHans Verkuil 				height, width, stride,
791dacca5f0SHans Verkuil 				frm->luma_alpha_step, is_intra, next_is_intra);
792dacca5f0SHans Verkuil 	if (encoding & FWHT_FRAME_UNENCODED)
793dacca5f0SHans Verkuil 		encoding |= FWHT_LUMA_UNENCODED;
794dacca5f0SHans Verkuil 	encoding &= ~FWHT_FRAME_UNENCODED;
795dacca5f0SHans Verkuil 
796dacca5f0SHans Verkuil 	if (frm->components_num >= 3) {
797dacca5f0SHans Verkuil 		u32 chroma_h = height / frm->height_div;
798dacca5f0SHans Verkuil 		u32 chroma_w = width / frm->width_div;
799dacca5f0SHans Verkuil 		unsigned int chroma_size = chroma_h * chroma_w;
800dacca5f0SHans Verkuil 
801dacca5f0SHans Verkuil 		rlco_max = rlco + chroma_size / 2 - 256;
802dacca5f0SHans Verkuil 		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
803dacca5f0SHans Verkuil 					 cf, chroma_h, chroma_w,
804dacca5f0SHans Verkuil 					 chroma_stride, frm->chroma_step,
805dacca5f0SHans Verkuil 					 is_intra, next_is_intra);
806dacca5f0SHans Verkuil 		if (encoding & FWHT_FRAME_UNENCODED)
807dacca5f0SHans Verkuil 			encoding |= FWHT_CB_UNENCODED;
808dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_UNENCODED;
809dacca5f0SHans Verkuil 		rlco_max = rlco + chroma_size / 2 - 256;
810dacca5f0SHans Verkuil 		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
811dacca5f0SHans Verkuil 					 cf, chroma_h, chroma_w,
812dacca5f0SHans Verkuil 					 chroma_stride, frm->chroma_step,
813dacca5f0SHans Verkuil 					 is_intra, next_is_intra);
814dacca5f0SHans Verkuil 		if (encoding & FWHT_FRAME_UNENCODED)
815dacca5f0SHans Verkuil 			encoding |= FWHT_CR_UNENCODED;
816dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_UNENCODED;
817dacca5f0SHans Verkuil 	}
818dacca5f0SHans Verkuil 
819dacca5f0SHans Verkuil 	if (frm->components_num == 4) {
820dacca5f0SHans Verkuil 		rlco_max = rlco + size / 2 - 256;
821dacca5f0SHans Verkuil 		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
822dacca5f0SHans Verkuil 					 rlco_max, cf, height, width,
823dacca5f0SHans Verkuil 					 stride, frm->luma_alpha_step,
824dacca5f0SHans Verkuil 					 is_intra, next_is_intra);
825dacca5f0SHans Verkuil 		if (encoding & FWHT_FRAME_UNENCODED)
826dacca5f0SHans Verkuil 			encoding |= FWHT_ALPHA_UNENCODED;
827dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_UNENCODED;
828dacca5f0SHans Verkuil 	}
829dacca5f0SHans Verkuil 
830dacca5f0SHans Verkuil 	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
831dacca5f0SHans Verkuil 	return encoding;
832dacca5f0SHans Verkuil }
833dacca5f0SHans Verkuil 
834dacca5f0SHans Verkuil static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
835dacca5f0SHans Verkuil 			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
836dacca5f0SHans Verkuil 			 unsigned int ref_step, u8 *dst,
837dacca5f0SHans Verkuil 			 unsigned int dst_stride, unsigned int dst_step,
838dacca5f0SHans Verkuil 			 bool uncompressed, const __be16 *end_of_rlco_buf)
839dacca5f0SHans Verkuil {
840dacca5f0SHans Verkuil 	unsigned int copies = 0;
841dacca5f0SHans Verkuil 	s16 copy[8 * 8];
842dacca5f0SHans Verkuil 	u16 stat;
843dacca5f0SHans Verkuil 	unsigned int i, j;
844dacca5f0SHans Verkuil 	bool is_intra = !ref;
845dacca5f0SHans Verkuil 
846dacca5f0SHans Verkuil 	width = round_up(width, 8);
847dacca5f0SHans Verkuil 	height = round_up(height, 8);
848dacca5f0SHans Verkuil 
849dacca5f0SHans Verkuil 	if (uncompressed) {
850dacca5f0SHans Verkuil 		int i;
851dacca5f0SHans Verkuil 
852dacca5f0SHans Verkuil 		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
853dacca5f0SHans Verkuil 			return false;
854dacca5f0SHans Verkuil 		for (i = 0; i < height; i++) {
855dacca5f0SHans Verkuil 			memcpy(dst, *rlco, width);
856dacca5f0SHans Verkuil 			dst += dst_stride;
857dacca5f0SHans Verkuil 			*rlco += width / 2;
858dacca5f0SHans Verkuil 		}
859dacca5f0SHans Verkuil 		return true;
860dacca5f0SHans Verkuil 	}
861dacca5f0SHans Verkuil 
862dacca5f0SHans Verkuil 	/*
863dacca5f0SHans Verkuil 	 * When decoding each macroblock the rlco pointer will be increased
864dacca5f0SHans Verkuil 	 * by 65 * 2 bytes worst-case.
865dacca5f0SHans Verkuil 	 * To avoid overflow the buffer has to be 65/64th of the actual raw
866dacca5f0SHans Verkuil 	 * image size, just in case someone feeds it malicious data.
867dacca5f0SHans Verkuil 	 */
868dacca5f0SHans Verkuil 	for (j = 0; j < height / 8; j++) {
869dacca5f0SHans Verkuil 		for (i = 0; i < width / 8; i++) {
870dacca5f0SHans Verkuil 			const u8 *refp = ref + j * 8 * ref_stride +
871dacca5f0SHans Verkuil 				i * 8 * ref_step;
872dacca5f0SHans Verkuil 			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;
873dacca5f0SHans Verkuil 
874dacca5f0SHans Verkuil 			if (copies) {
875dacca5f0SHans Verkuil 				memcpy(cf->de_fwht, copy, sizeof(copy));
876dacca5f0SHans Verkuil 				if ((stat & PFRAME_BIT) && !is_intra)
877dacca5f0SHans Verkuil 					add_deltas(cf->de_fwht, refp,
878dacca5f0SHans Verkuil 						   ref_stride, ref_step);
879dacca5f0SHans Verkuil 				fill_decoder_block(dstp, cf->de_fwht,
880dacca5f0SHans Verkuil 						   dst_stride, dst_step);
881dacca5f0SHans Verkuil 				copies--;
882dacca5f0SHans Verkuil 				continue;
883dacca5f0SHans Verkuil 			}
884dacca5f0SHans Verkuil 
885dacca5f0SHans Verkuil 			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
886dacca5f0SHans Verkuil 			if (stat & OVERFLOW_BIT)
887dacca5f0SHans Verkuil 				return false;
888dacca5f0SHans Verkuil 			if ((stat & PFRAME_BIT) && !is_intra)
889dacca5f0SHans Verkuil 				dequantize_inter(cf->coeffs);
890dacca5f0SHans Verkuil 			else
891dacca5f0SHans Verkuil 				dequantize_intra(cf->coeffs);
892dacca5f0SHans Verkuil 
893dacca5f0SHans Verkuil 			ifwht(cf->coeffs, cf->de_fwht,
894dacca5f0SHans Verkuil 			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);
895dacca5f0SHans Verkuil 
896dacca5f0SHans Verkuil 			copies = (stat & DUPS_MASK) >> 1;
897dacca5f0SHans Verkuil 			if (copies)
898dacca5f0SHans Verkuil 				memcpy(copy, cf->de_fwht, sizeof(copy));
899dacca5f0SHans Verkuil 			if ((stat & PFRAME_BIT) && !is_intra)
900dacca5f0SHans Verkuil 				add_deltas(cf->de_fwht, refp,
901dacca5f0SHans Verkuil 					   ref_stride, ref_step);
902dacca5f0SHans Verkuil 			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
903dacca5f0SHans Verkuil 					   dst_step);
904dacca5f0SHans Verkuil 		}
905dacca5f0SHans Verkuil 	}
906dacca5f0SHans Verkuil 	return true;
907dacca5f0SHans Verkuil }
908dacca5f0SHans Verkuil 
909dacca5f0SHans Verkuil bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
910dacca5f0SHans Verkuil 		       unsigned int components_num, unsigned int width,
911dacca5f0SHans Verkuil 		       unsigned int height, const struct fwht_raw_frame *ref,
912dacca5f0SHans Verkuil 		       unsigned int ref_stride, unsigned int ref_chroma_stride,
913dacca5f0SHans Verkuil 		       struct fwht_raw_frame *dst, unsigned int dst_stride,
914dacca5f0SHans Verkuil 		       unsigned int dst_chroma_stride)
915dacca5f0SHans Verkuil {
916dacca5f0SHans Verkuil 	const __be16 *rlco = cf->rlc_data;
917dacca5f0SHans Verkuil 	const __be16 *end_of_rlco_buf = cf->rlc_data +
918dacca5f0SHans Verkuil 			(cf->size / sizeof(*rlco)) - 1;
919dacca5f0SHans Verkuil 
920dacca5f0SHans Verkuil 	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
921dacca5f0SHans Verkuil 			  ref->luma_alpha_step, dst->luma, dst_stride,
922dacca5f0SHans Verkuil 			  dst->luma_alpha_step,
923*3abfc314SHans Verkuil 			  hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
924dacca5f0SHans Verkuil 			  end_of_rlco_buf))
925dacca5f0SHans Verkuil 		return false;
926dacca5f0SHans Verkuil 
927dacca5f0SHans Verkuil 	if (components_num >= 3) {
928dacca5f0SHans Verkuil 		u32 h = height;
929dacca5f0SHans Verkuil 		u32 w = width;
930dacca5f0SHans Verkuil 
931*3abfc314SHans Verkuil 		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
932dacca5f0SHans Verkuil 			h /= 2;
933*3abfc314SHans Verkuil 		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
934dacca5f0SHans Verkuil 			w /= 2;
935dacca5f0SHans Verkuil 
936dacca5f0SHans Verkuil 		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
937dacca5f0SHans Verkuil 				  ref->chroma_step, dst->cb, dst_chroma_stride,
938dacca5f0SHans Verkuil 				  dst->chroma_step,
939*3abfc314SHans Verkuil 				  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
940dacca5f0SHans Verkuil 				  end_of_rlco_buf))
941dacca5f0SHans Verkuil 			return false;
942dacca5f0SHans Verkuil 		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
943dacca5f0SHans Verkuil 				  ref->chroma_step, dst->cr, dst_chroma_stride,
944dacca5f0SHans Verkuil 				  dst->chroma_step,
945*3abfc314SHans Verkuil 				  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
946dacca5f0SHans Verkuil 				  end_of_rlco_buf))
947dacca5f0SHans Verkuil 			return false;
948dacca5f0SHans Verkuil 	}
949dacca5f0SHans Verkuil 
950dacca5f0SHans Verkuil 	if (components_num == 4)
951dacca5f0SHans Verkuil 		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
952dacca5f0SHans Verkuil 				  ref->luma_alpha_step, dst->alpha, dst_stride,
953dacca5f0SHans Verkuil 				  dst->luma_alpha_step,
954*3abfc314SHans Verkuil 				  hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
955dacca5f0SHans Verkuil 				  end_of_rlco_buf))
956dacca5f0SHans Verkuil 			return false;
957dacca5f0SHans Verkuil 	return true;
958dacca5f0SHans Verkuil }
959