xref: /openbmc/linux/drivers/media/test-drivers/vicodec/codec-fwht.c (revision cdd38c5f1ce4398ec58fec95904b75824daab7b5)
// SPDX-License-Identifier: LGPL-2.1+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */
11dacca5f0SHans Verkuil 
12dacca5f0SHans Verkuil #include <linux/string.h>
13dacca5f0SHans Verkuil #include <linux/kernel.h>
14*206bc0f6SHans Verkuil #include <linux/videodev2.h>
15dacca5f0SHans Verkuil #include "codec-fwht.h"
16dacca5f0SHans Verkuil 
/* Returned by derlc() when the input runs past end_of_input. */
#define OVERFLOW_BIT	BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
#define PFRAME_BIT	BIT(15)
#define DUPS_MASK	0x1ffe

/* Block types: rlc() sets PFRAME_BIT in the header word for PBLOCK. */
#define PBLOCK		0
#define IBLOCK		1

/* Run-length value meaning "the rest of the block is zero". */
#define ALL_ZEROS	15
31dacca5f0SHans Verkuil 
/*
 * Coefficient scan order: zigzag[i] is the row-major index (x + y * 8)
 * of the i-th coefficient visited. Each line below is one anti-diagonal
 * of the 8x8 block, starting at the top-left corner.
 */
static const uint8_t zigzag[8 * 8] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
49dacca5f0SHans Verkuil 
50dacca5f0SHans Verkuil /*
51dacca5f0SHans Verkuil  * noinline_for_stack to work around
52dacca5f0SHans Verkuil  * https://bugs.llvm.org/show_bug.cgi?id=38809
53dacca5f0SHans Verkuil  */
54dacca5f0SHans Verkuil static int noinline_for_stack
rlc(const s16 * in,__be16 * output,int blocktype)55dacca5f0SHans Verkuil rlc(const s16 *in, __be16 *output, int blocktype)
56dacca5f0SHans Verkuil {
57dacca5f0SHans Verkuil 	s16 block[8 * 8];
58dacca5f0SHans Verkuil 	s16 *wp = block;
59dacca5f0SHans Verkuil 	int i = 0;
60dacca5f0SHans Verkuil 	int x, y;
61dacca5f0SHans Verkuil 	int ret = 0;
62dacca5f0SHans Verkuil 
63dacca5f0SHans Verkuil 	/* read in block from framebuffer */
64dacca5f0SHans Verkuil 	int lastzero_run = 0;
65dacca5f0SHans Verkuil 	int to_encode;
66dacca5f0SHans Verkuil 
67dacca5f0SHans Verkuil 	for (y = 0; y < 8; y++) {
68dacca5f0SHans Verkuil 		for (x = 0; x < 8; x++) {
69dacca5f0SHans Verkuil 			*wp = in[x + y * 8];
70dacca5f0SHans Verkuil 			wp++;
71dacca5f0SHans Verkuil 		}
72dacca5f0SHans Verkuil 	}
73dacca5f0SHans Verkuil 
74dacca5f0SHans Verkuil 	/* keep track of amount of trailing zeros */
75dacca5f0SHans Verkuil 	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
76dacca5f0SHans Verkuil 		lastzero_run++;
77dacca5f0SHans Verkuil 
78dacca5f0SHans Verkuil 	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
79dacca5f0SHans Verkuil 	ret++;
80dacca5f0SHans Verkuil 
81dacca5f0SHans Verkuil 	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
82dacca5f0SHans Verkuil 
83dacca5f0SHans Verkuil 	i = 0;
84dacca5f0SHans Verkuil 	while (i < to_encode) {
85dacca5f0SHans Verkuil 		int cnt = 0;
86dacca5f0SHans Verkuil 		int tmp;
87dacca5f0SHans Verkuil 
88dacca5f0SHans Verkuil 		/* count leading zeros */
89dacca5f0SHans Verkuil 		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
90dacca5f0SHans Verkuil 			cnt++;
91dacca5f0SHans Verkuil 			i++;
92dacca5f0SHans Verkuil 			if (i == to_encode) {
93dacca5f0SHans Verkuil 				cnt--;
94dacca5f0SHans Verkuil 				break;
95dacca5f0SHans Verkuil 			}
96dacca5f0SHans Verkuil 		}
97dacca5f0SHans Verkuil 		/* 4 bits for run, 12 for coefficient (quantization by 4) */
98dacca5f0SHans Verkuil 		*output++ = htons((cnt | tmp << 4));
99dacca5f0SHans Verkuil 		i++;
100dacca5f0SHans Verkuil 		ret++;
101dacca5f0SHans Verkuil 	}
102dacca5f0SHans Verkuil 	if (lastzero_run > 14) {
103dacca5f0SHans Verkuil 		*output = htons(ALL_ZEROS | 0);
104dacca5f0SHans Verkuil 		ret++;
105dacca5f0SHans Verkuil 	}
106dacca5f0SHans Verkuil 
107dacca5f0SHans Verkuil 	return ret;
108dacca5f0SHans Verkuil }
109dacca5f0SHans Verkuil 
110dacca5f0SHans Verkuil /*
111dacca5f0SHans Verkuil  * This function will worst-case increase rlc_in by 65*2 bytes:
112dacca5f0SHans Verkuil  * one s16 value for the header and 8 * 8 coefficients of type s16.
113dacca5f0SHans Verkuil  */
114dacca5f0SHans Verkuil static noinline_for_stack u16
derlc(const __be16 ** rlc_in,s16 * dwht_out,const __be16 * end_of_input)115dacca5f0SHans Verkuil derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
116dacca5f0SHans Verkuil {
117dacca5f0SHans Verkuil 	/* header */
118dacca5f0SHans Verkuil 	const __be16 *input = *rlc_in;
119dacca5f0SHans Verkuil 	u16 stat;
120dacca5f0SHans Verkuil 	int dec_count = 0;
121dacca5f0SHans Verkuil 	s16 block[8 * 8 + 16];
122dacca5f0SHans Verkuil 	s16 *wp = block;
123dacca5f0SHans Verkuil 	int i;
124dacca5f0SHans Verkuil 
125dacca5f0SHans Verkuil 	if (input > end_of_input)
126dacca5f0SHans Verkuil 		return OVERFLOW_BIT;
127dacca5f0SHans Verkuil 	stat = ntohs(*input++);
128dacca5f0SHans Verkuil 
129dacca5f0SHans Verkuil 	/*
130dacca5f0SHans Verkuil 	 * Now de-compress, it expands one byte to up to 15 bytes
131dacca5f0SHans Verkuil 	 * (or fills the remainder of the 64 bytes with zeroes if it
132dacca5f0SHans Verkuil 	 * is the last byte to expand).
133dacca5f0SHans Verkuil 	 *
134dacca5f0SHans Verkuil 	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
135dacca5f0SHans Verkuil 	 * allow for overflow if the incoming data was malformed.
136dacca5f0SHans Verkuil 	 */
137dacca5f0SHans Verkuil 	while (dec_count < 8 * 8) {
138dacca5f0SHans Verkuil 		s16 in;
139dacca5f0SHans Verkuil 		int length;
140dacca5f0SHans Verkuil 		int coeff;
141dacca5f0SHans Verkuil 
142dacca5f0SHans Verkuil 		if (input > end_of_input)
143dacca5f0SHans Verkuil 			return OVERFLOW_BIT;
144dacca5f0SHans Verkuil 		in = ntohs(*input++);
145dacca5f0SHans Verkuil 		length = in & 0xf;
146dacca5f0SHans Verkuil 		coeff = in >> 4;
147dacca5f0SHans Verkuil 
148dacca5f0SHans Verkuil 		/* fill remainder with zeros */
149dacca5f0SHans Verkuil 		if (length == 15) {
150dacca5f0SHans Verkuil 			for (i = 0; i < 64 - dec_count; i++)
151dacca5f0SHans Verkuil 				*wp++ = 0;
152dacca5f0SHans Verkuil 			break;
153dacca5f0SHans Verkuil 		}
154dacca5f0SHans Verkuil 
155dacca5f0SHans Verkuil 		for (i = 0; i < length; i++)
156dacca5f0SHans Verkuil 			*wp++ = 0;
157dacca5f0SHans Verkuil 		*wp++ = coeff;
158dacca5f0SHans Verkuil 		dec_count += length + 1;
159dacca5f0SHans Verkuil 	}
160dacca5f0SHans Verkuil 
161dacca5f0SHans Verkuil 	wp = block;
162dacca5f0SHans Verkuil 
163dacca5f0SHans Verkuil 	for (i = 0; i < 64; i++) {
164dacca5f0SHans Verkuil 		int pos = zigzag[i];
165dacca5f0SHans Verkuil 		int y = pos / 8;
166dacca5f0SHans Verkuil 		int x = pos % 8;
167dacca5f0SHans Verkuil 
168dacca5f0SHans Verkuil 		dwht_out[x + y * 8] = *wp++;
169dacca5f0SHans Verkuil 	}
170dacca5f0SHans Verkuil 	*rlc_in = input;
171dacca5f0SHans Verkuil 	return stat;
172dacca5f0SHans Verkuil }
173dacca5f0SHans Verkuil 
/*
 * Shift amount for each of the 8 * 8 coefficient positions, used by
 * quantize_intra() (right shift) and dequantize_intra() (left shift).
 */
static const int quant_table[8 * 8] = {
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 3,
	2, 2, 2, 2, 2, 2, 3, 6,
	2, 2, 2, 2, 2, 3, 6, 6,
	2, 2, 2, 2, 3, 6, 6, 6,
	2, 2, 2, 3, 6, 6, 6, 6,
	2, 2, 3, 6, 6, 6, 6, 8,
};
184dacca5f0SHans Verkuil 
/*
 * Shift amount for each of the 8 * 8 coefficient positions, used by
 * quantize_inter() (right shift) and dequantize_inter() (left shift).
 */
static const int quant_table_p[8 * 8] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 6,
	3, 3, 3, 3, 3, 3, 6, 6,
	3, 3, 3, 3, 3, 6, 6, 9,
	3, 3, 3, 3, 6, 6, 9, 9,
	3, 3, 3, 6, 6, 9, 9, 10,
};
195dacca5f0SHans Verkuil 
/*
 * Quantize 64 coefficients in place using quant_table.
 *
 * @coeff:    coefficients; each one is shifted right by its table entry
 * @de_coeff: receives the value shifted back left by the same amount
 * @qp:       dead zone; a shifted value within [-qp, qp] is replaced by
 *            0 in both @coeff and @de_coeff
 */
static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
{
	int idx;

	for (idx = 0; idx < 8 * 8; idx++) {
		const int shift = quant_table[idx];
		const s16 q = coeff[idx] >> shift;

		if (q < -qp || q > qp) {
			coeff[idx] = q;
			de_coeff[idx] = q << shift;
		} else {
			coeff[idx] = 0;
			de_coeff[idx] = 0;
		}
	}
}
211dacca5f0SHans Verkuil 
/*
 * Shift each of the 64 coefficients in @coeff left, in place, by the
 * matching quant_table entry.
 */
static void dequantize_intra(s16 *coeff)
{
	int idx;

	for (idx = 0; idx < 8 * 8; idx++)
		coeff[idx] = coeff[idx] << quant_table[idx];
}
221dacca5f0SHans Verkuil 
/*
 * Quantize 64 coefficients in place using quant_table_p.
 *
 * @coeff:    coefficients; each one is shifted right by its table entry
 * @de_coeff: receives the value shifted back left by the same amount
 * @qp:       dead zone; a shifted value within [-qp, qp] is replaced by
 *            0 in both @coeff and @de_coeff
 */
static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
{
	int idx;

	for (idx = 0; idx < 8 * 8; idx++) {
		const int shift = quant_table_p[idx];
		const s16 q = coeff[idx] >> shift;

		if (q < -qp || q > qp) {
			coeff[idx] = q;
			de_coeff[idx] = q << shift;
		} else {
			coeff[idx] = 0;
			de_coeff[idx] = 0;
		}
	}
}
237dacca5f0SHans Verkuil 
/*
 * Shift each of the 64 coefficients in @coeff left, in place, by the
 * matching quant_table_p entry.
 */
static void dequantize_inter(s16 *coeff)
{
	int idx;

	for (idx = 0; idx < 8 * 8; idx++)
		coeff[idx] = coeff[idx] << quant_table_p[idx];
}
247dacca5f0SHans Verkuil 
fwht(const u8 * block,s16 * output_block,unsigned int stride,unsigned int input_step,bool intra)248dacca5f0SHans Verkuil static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
249dacca5f0SHans Verkuil 				    unsigned int stride,
250dacca5f0SHans Verkuil 				    unsigned int input_step, bool intra)
251dacca5f0SHans Verkuil {
252dacca5f0SHans Verkuil 	/* we'll need more than 8 bits for the transformed coefficients */
253dacca5f0SHans Verkuil 	s32 workspace1[8], workspace2[8];
254dacca5f0SHans Verkuil 	const u8 *tmp = block;
255dacca5f0SHans Verkuil 	s16 *out = output_block;
256dacca5f0SHans Verkuil 	int add = intra ? 256 : 0;
257dacca5f0SHans Verkuil 	unsigned int i;
258dacca5f0SHans Verkuil 
259dacca5f0SHans Verkuil 	/* stage 1 */
260dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
261dacca5f0SHans Verkuil 		switch (input_step) {
262dacca5f0SHans Verkuil 		case 1:
263dacca5f0SHans Verkuil 			workspace1[0]  = tmp[0] + tmp[1] - add;
264dacca5f0SHans Verkuil 			workspace1[1]  = tmp[0] - tmp[1];
265dacca5f0SHans Verkuil 
266dacca5f0SHans Verkuil 			workspace1[2]  = tmp[2] + tmp[3] - add;
267dacca5f0SHans Verkuil 			workspace1[3]  = tmp[2] - tmp[3];
268dacca5f0SHans Verkuil 
269dacca5f0SHans Verkuil 			workspace1[4]  = tmp[4] + tmp[5] - add;
270dacca5f0SHans Verkuil 			workspace1[5]  = tmp[4] - tmp[5];
271dacca5f0SHans Verkuil 
272dacca5f0SHans Verkuil 			workspace1[6]  = tmp[6] + tmp[7] - add;
273dacca5f0SHans Verkuil 			workspace1[7]  = tmp[6] - tmp[7];
274dacca5f0SHans Verkuil 			break;
275dacca5f0SHans Verkuil 		case 2:
276dacca5f0SHans Verkuil 			workspace1[0]  = tmp[0] + tmp[2] - add;
277dacca5f0SHans Verkuil 			workspace1[1]  = tmp[0] - tmp[2];
278dacca5f0SHans Verkuil 
279dacca5f0SHans Verkuil 			workspace1[2]  = tmp[4] + tmp[6] - add;
280dacca5f0SHans Verkuil 			workspace1[3]  = tmp[4] - tmp[6];
281dacca5f0SHans Verkuil 
282dacca5f0SHans Verkuil 			workspace1[4]  = tmp[8] + tmp[10] - add;
283dacca5f0SHans Verkuil 			workspace1[5]  = tmp[8] - tmp[10];
284dacca5f0SHans Verkuil 
285dacca5f0SHans Verkuil 			workspace1[6]  = tmp[12] + tmp[14] - add;
286dacca5f0SHans Verkuil 			workspace1[7]  = tmp[12] - tmp[14];
287dacca5f0SHans Verkuil 			break;
288dacca5f0SHans Verkuil 		case 3:
289dacca5f0SHans Verkuil 			workspace1[0]  = tmp[0] + tmp[3] - add;
290dacca5f0SHans Verkuil 			workspace1[1]  = tmp[0] - tmp[3];
291dacca5f0SHans Verkuil 
292dacca5f0SHans Verkuil 			workspace1[2]  = tmp[6] + tmp[9] - add;
293dacca5f0SHans Verkuil 			workspace1[3]  = tmp[6] - tmp[9];
294dacca5f0SHans Verkuil 
295dacca5f0SHans Verkuil 			workspace1[4]  = tmp[12] + tmp[15] - add;
296dacca5f0SHans Verkuil 			workspace1[5]  = tmp[12] - tmp[15];
297dacca5f0SHans Verkuil 
298dacca5f0SHans Verkuil 			workspace1[6]  = tmp[18] + tmp[21] - add;
299dacca5f0SHans Verkuil 			workspace1[7]  = tmp[18] - tmp[21];
300dacca5f0SHans Verkuil 			break;
301dacca5f0SHans Verkuil 		default:
302dacca5f0SHans Verkuil 			workspace1[0]  = tmp[0] + tmp[4] - add;
303dacca5f0SHans Verkuil 			workspace1[1]  = tmp[0] - tmp[4];
304dacca5f0SHans Verkuil 
305dacca5f0SHans Verkuil 			workspace1[2]  = tmp[8] + tmp[12] - add;
306dacca5f0SHans Verkuil 			workspace1[3]  = tmp[8] - tmp[12];
307dacca5f0SHans Verkuil 
308dacca5f0SHans Verkuil 			workspace1[4]  = tmp[16] + tmp[20] - add;
309dacca5f0SHans Verkuil 			workspace1[5]  = tmp[16] - tmp[20];
310dacca5f0SHans Verkuil 
311dacca5f0SHans Verkuil 			workspace1[6]  = tmp[24] + tmp[28] - add;
312dacca5f0SHans Verkuil 			workspace1[7]  = tmp[24] - tmp[28];
313dacca5f0SHans Verkuil 			break;
314dacca5f0SHans Verkuil 		}
315dacca5f0SHans Verkuil 
316dacca5f0SHans Verkuil 		/* stage 2 */
317dacca5f0SHans Verkuil 		workspace2[0] = workspace1[0] + workspace1[2];
318dacca5f0SHans Verkuil 		workspace2[1] = workspace1[0] - workspace1[2];
319dacca5f0SHans Verkuil 		workspace2[2] = workspace1[1] - workspace1[3];
320dacca5f0SHans Verkuil 		workspace2[3] = workspace1[1] + workspace1[3];
321dacca5f0SHans Verkuil 
322dacca5f0SHans Verkuil 		workspace2[4] = workspace1[4] + workspace1[6];
323dacca5f0SHans Verkuil 		workspace2[5] = workspace1[4] - workspace1[6];
324dacca5f0SHans Verkuil 		workspace2[6] = workspace1[5] - workspace1[7];
325dacca5f0SHans Verkuil 		workspace2[7] = workspace1[5] + workspace1[7];
326dacca5f0SHans Verkuil 
327dacca5f0SHans Verkuil 		/* stage 3 */
328dacca5f0SHans Verkuil 		out[0] = workspace2[0] + workspace2[4];
329dacca5f0SHans Verkuil 		out[1] = workspace2[0] - workspace2[4];
330dacca5f0SHans Verkuil 		out[2] = workspace2[1] - workspace2[5];
331dacca5f0SHans Verkuil 		out[3] = workspace2[1] + workspace2[5];
332dacca5f0SHans Verkuil 		out[4] = workspace2[2] + workspace2[6];
333dacca5f0SHans Verkuil 		out[5] = workspace2[2] - workspace2[6];
334dacca5f0SHans Verkuil 		out[6] = workspace2[3] - workspace2[7];
335dacca5f0SHans Verkuil 		out[7] = workspace2[3] + workspace2[7];
336dacca5f0SHans Verkuil 	}
337dacca5f0SHans Verkuil 
338dacca5f0SHans Verkuil 	out = output_block;
339dacca5f0SHans Verkuil 
340dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++, out++) {
341dacca5f0SHans Verkuil 		/* stage 1 */
342dacca5f0SHans Verkuil 		workspace1[0]  = out[0] + out[1 * 8];
343dacca5f0SHans Verkuil 		workspace1[1]  = out[0] - out[1 * 8];
344dacca5f0SHans Verkuil 
345dacca5f0SHans Verkuil 		workspace1[2]  = out[2 * 8] + out[3 * 8];
346dacca5f0SHans Verkuil 		workspace1[3]  = out[2 * 8] - out[3 * 8];
347dacca5f0SHans Verkuil 
348dacca5f0SHans Verkuil 		workspace1[4]  = out[4 * 8] + out[5 * 8];
349dacca5f0SHans Verkuil 		workspace1[5]  = out[4 * 8] - out[5 * 8];
350dacca5f0SHans Verkuil 
351dacca5f0SHans Verkuil 		workspace1[6]  = out[6 * 8] + out[7 * 8];
352dacca5f0SHans Verkuil 		workspace1[7]  = out[6 * 8] - out[7 * 8];
353dacca5f0SHans Verkuil 
354dacca5f0SHans Verkuil 		/* stage 2 */
355dacca5f0SHans Verkuil 		workspace2[0] = workspace1[0] + workspace1[2];
356dacca5f0SHans Verkuil 		workspace2[1] = workspace1[0] - workspace1[2];
357dacca5f0SHans Verkuil 		workspace2[2] = workspace1[1] - workspace1[3];
358dacca5f0SHans Verkuil 		workspace2[3] = workspace1[1] + workspace1[3];
359dacca5f0SHans Verkuil 
360dacca5f0SHans Verkuil 		workspace2[4] = workspace1[4] + workspace1[6];
361dacca5f0SHans Verkuil 		workspace2[5] = workspace1[4] - workspace1[6];
362dacca5f0SHans Verkuil 		workspace2[6] = workspace1[5] - workspace1[7];
363dacca5f0SHans Verkuil 		workspace2[7] = workspace1[5] + workspace1[7];
364dacca5f0SHans Verkuil 		/* stage 3 */
365dacca5f0SHans Verkuil 		out[0 * 8] = workspace2[0] + workspace2[4];
366dacca5f0SHans Verkuil 		out[1 * 8] = workspace2[0] - workspace2[4];
367dacca5f0SHans Verkuil 		out[2 * 8] = workspace2[1] - workspace2[5];
368dacca5f0SHans Verkuil 		out[3 * 8] = workspace2[1] + workspace2[5];
369dacca5f0SHans Verkuil 		out[4 * 8] = workspace2[2] + workspace2[6];
370dacca5f0SHans Verkuil 		out[5 * 8] = workspace2[2] - workspace2[6];
371dacca5f0SHans Verkuil 		out[6 * 8] = workspace2[3] - workspace2[7];
372dacca5f0SHans Verkuil 		out[7 * 8] = workspace2[3] + workspace2[7];
373dacca5f0SHans Verkuil 	}
374dacca5f0SHans Verkuil }
375dacca5f0SHans Verkuil 
376dacca5f0SHans Verkuil /*
377dacca5f0SHans Verkuil  * Not the nicest way of doing it, but P-blocks get twice the range of
378dacca5f0SHans Verkuil  * that of the I-blocks. Therefore we need a type bigger than 8 bits.
379dacca5f0SHans Verkuil  * Furthermore values can be negative... This is just a version that
380dacca5f0SHans Verkuil  * works with 16 signed data
381dacca5f0SHans Verkuil  */
382dacca5f0SHans Verkuil static void noinline_for_stack
fwht16(const s16 * block,s16 * output_block,int stride,int intra)383dacca5f0SHans Verkuil fwht16(const s16 *block, s16 *output_block, int stride, int intra)
384dacca5f0SHans Verkuil {
385dacca5f0SHans Verkuil 	/* we'll need more than 8 bits for the transformed coefficients */
386dacca5f0SHans Verkuil 	s32 workspace1[8], workspace2[8];
387dacca5f0SHans Verkuil 	const s16 *tmp = block;
388dacca5f0SHans Verkuil 	s16 *out = output_block;
389dacca5f0SHans Verkuil 	int i;
390dacca5f0SHans Verkuil 
391dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
392dacca5f0SHans Verkuil 		/* stage 1 */
393dacca5f0SHans Verkuil 		workspace1[0]  = tmp[0] + tmp[1];
394dacca5f0SHans Verkuil 		workspace1[1]  = tmp[0] - tmp[1];
395dacca5f0SHans Verkuil 
396dacca5f0SHans Verkuil 		workspace1[2]  = tmp[2] + tmp[3];
397dacca5f0SHans Verkuil 		workspace1[3]  = tmp[2] - tmp[3];
398dacca5f0SHans Verkuil 
399dacca5f0SHans Verkuil 		workspace1[4]  = tmp[4] + tmp[5];
400dacca5f0SHans Verkuil 		workspace1[5]  = tmp[4] - tmp[5];
401dacca5f0SHans Verkuil 
402dacca5f0SHans Verkuil 		workspace1[6]  = tmp[6] + tmp[7];
403dacca5f0SHans Verkuil 		workspace1[7]  = tmp[6] - tmp[7];
404dacca5f0SHans Verkuil 
405dacca5f0SHans Verkuil 		/* stage 2 */
406dacca5f0SHans Verkuil 		workspace2[0] = workspace1[0] + workspace1[2];
407dacca5f0SHans Verkuil 		workspace2[1] = workspace1[0] - workspace1[2];
408dacca5f0SHans Verkuil 		workspace2[2] = workspace1[1] - workspace1[3];
409dacca5f0SHans Verkuil 		workspace2[3] = workspace1[1] + workspace1[3];
410dacca5f0SHans Verkuil 
411dacca5f0SHans Verkuil 		workspace2[4] = workspace1[4] + workspace1[6];
412dacca5f0SHans Verkuil 		workspace2[5] = workspace1[4] - workspace1[6];
413dacca5f0SHans Verkuil 		workspace2[6] = workspace1[5] - workspace1[7];
414dacca5f0SHans Verkuil 		workspace2[7] = workspace1[5] + workspace1[7];
415dacca5f0SHans Verkuil 
416dacca5f0SHans Verkuil 		/* stage 3 */
417dacca5f0SHans Verkuil 		out[0] = workspace2[0] + workspace2[4];
418dacca5f0SHans Verkuil 		out[1] = workspace2[0] - workspace2[4];
419dacca5f0SHans Verkuil 		out[2] = workspace2[1] - workspace2[5];
420dacca5f0SHans Verkuil 		out[3] = workspace2[1] + workspace2[5];
421dacca5f0SHans Verkuil 		out[4] = workspace2[2] + workspace2[6];
422dacca5f0SHans Verkuil 		out[5] = workspace2[2] - workspace2[6];
423dacca5f0SHans Verkuil 		out[6] = workspace2[3] - workspace2[7];
424dacca5f0SHans Verkuil 		out[7] = workspace2[3] + workspace2[7];
425dacca5f0SHans Verkuil 	}
426dacca5f0SHans Verkuil 
427dacca5f0SHans Verkuil 	out = output_block;
428dacca5f0SHans Verkuil 
429dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++, out++) {
430dacca5f0SHans Verkuil 		/* stage 1 */
431dacca5f0SHans Verkuil 		workspace1[0]  = out[0] + out[1*8];
432dacca5f0SHans Verkuil 		workspace1[1]  = out[0] - out[1*8];
433dacca5f0SHans Verkuil 
434dacca5f0SHans Verkuil 		workspace1[2]  = out[2*8] + out[3*8];
435dacca5f0SHans Verkuil 		workspace1[3]  = out[2*8] - out[3*8];
436dacca5f0SHans Verkuil 
437dacca5f0SHans Verkuil 		workspace1[4]  = out[4*8] + out[5*8];
438dacca5f0SHans Verkuil 		workspace1[5]  = out[4*8] - out[5*8];
439dacca5f0SHans Verkuil 
440dacca5f0SHans Verkuil 		workspace1[6]  = out[6*8] + out[7*8];
441dacca5f0SHans Verkuil 		workspace1[7]  = out[6*8] - out[7*8];
442dacca5f0SHans Verkuil 
443dacca5f0SHans Verkuil 		/* stage 2 */
444dacca5f0SHans Verkuil 		workspace2[0] = workspace1[0] + workspace1[2];
445dacca5f0SHans Verkuil 		workspace2[1] = workspace1[0] - workspace1[2];
446dacca5f0SHans Verkuil 		workspace2[2] = workspace1[1] - workspace1[3];
447dacca5f0SHans Verkuil 		workspace2[3] = workspace1[1] + workspace1[3];
448dacca5f0SHans Verkuil 
449dacca5f0SHans Verkuil 		workspace2[4] = workspace1[4] + workspace1[6];
450dacca5f0SHans Verkuil 		workspace2[5] = workspace1[4] - workspace1[6];
451dacca5f0SHans Verkuil 		workspace2[6] = workspace1[5] - workspace1[7];
452dacca5f0SHans Verkuil 		workspace2[7] = workspace1[5] + workspace1[7];
453dacca5f0SHans Verkuil 
454dacca5f0SHans Verkuil 		/* stage 3 */
455dacca5f0SHans Verkuil 		out[0*8] = workspace2[0] + workspace2[4];
456dacca5f0SHans Verkuil 		out[1*8] = workspace2[0] - workspace2[4];
457dacca5f0SHans Verkuil 		out[2*8] = workspace2[1] - workspace2[5];
458dacca5f0SHans Verkuil 		out[3*8] = workspace2[1] + workspace2[5];
459dacca5f0SHans Verkuil 		out[4*8] = workspace2[2] + workspace2[6];
460dacca5f0SHans Verkuil 		out[5*8] = workspace2[2] - workspace2[6];
461dacca5f0SHans Verkuil 		out[6*8] = workspace2[3] - workspace2[7];
462dacca5f0SHans Verkuil 		out[7*8] = workspace2[3] + workspace2[7];
463dacca5f0SHans Verkuil 	}
464dacca5f0SHans Verkuil }
465dacca5f0SHans Verkuil 
466dacca5f0SHans Verkuil static noinline_for_stack void
ifwht(const s16 * block,s16 * output_block,int intra)467dacca5f0SHans Verkuil ifwht(const s16 *block, s16 *output_block, int intra)
468dacca5f0SHans Verkuil {
469dacca5f0SHans Verkuil 	/*
470dacca5f0SHans Verkuil 	 * we'll need more than 8 bits for the transformed coefficients
471dacca5f0SHans Verkuil 	 * use native unit of cpu
472dacca5f0SHans Verkuil 	 */
473dacca5f0SHans Verkuil 	int workspace1[8], workspace2[8];
474dacca5f0SHans Verkuil 	int inter = intra ? 0 : 1;
475dacca5f0SHans Verkuil 	const s16 *tmp = block;
476dacca5f0SHans Verkuil 	s16 *out = output_block;
477dacca5f0SHans Verkuil 	int i;
478dacca5f0SHans Verkuil 
479dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
480dacca5f0SHans Verkuil 		/* stage 1 */
481dacca5f0SHans Verkuil 		workspace1[0]  = tmp[0] + tmp[1];
482dacca5f0SHans Verkuil 		workspace1[1]  = tmp[0] - tmp[1];
483dacca5f0SHans Verkuil 
484dacca5f0SHans Verkuil 		workspace1[2]  = tmp[2] + tmp[3];
485dacca5f0SHans Verkuil 		workspace1[3]  = tmp[2] - tmp[3];
486dacca5f0SHans Verkuil 
487dacca5f0SHans Verkuil 		workspace1[4]  = tmp[4] + tmp[5];
488dacca5f0SHans Verkuil 		workspace1[5]  = tmp[4] - tmp[5];
489dacca5f0SHans Verkuil 
490dacca5f0SHans Verkuil 		workspace1[6]  = tmp[6] + tmp[7];
491dacca5f0SHans Verkuil 		workspace1[7]  = tmp[6] - tmp[7];
492dacca5f0SHans Verkuil 
493dacca5f0SHans Verkuil 		/* stage 2 */
494dacca5f0SHans Verkuil 		workspace2[0] = workspace1[0] + workspace1[2];
495dacca5f0SHans Verkuil 		workspace2[1] = workspace1[0] - workspace1[2];
496dacca5f0SHans Verkuil 		workspace2[2] = workspace1[1] - workspace1[3];
497dacca5f0SHans Verkuil 		workspace2[3] = workspace1[1] + workspace1[3];
498dacca5f0SHans Verkuil 
499dacca5f0SHans Verkuil 		workspace2[4] = workspace1[4] + workspace1[6];
500dacca5f0SHans Verkuil 		workspace2[5] = workspace1[4] - workspace1[6];
501dacca5f0SHans Verkuil 		workspace2[6] = workspace1[5] - workspace1[7];
502dacca5f0SHans Verkuil 		workspace2[7] = workspace1[5] + workspace1[7];
503dacca5f0SHans Verkuil 
504dacca5f0SHans Verkuil 		/* stage 3 */
505dacca5f0SHans Verkuil 		out[0] = workspace2[0] + workspace2[4];
506dacca5f0SHans Verkuil 		out[1] = workspace2[0] - workspace2[4];
507dacca5f0SHans Verkuil 		out[2] = workspace2[1] - workspace2[5];
508dacca5f0SHans Verkuil 		out[3] = workspace2[1] + workspace2[5];
509dacca5f0SHans Verkuil 		out[4] = workspace2[2] + workspace2[6];
510dacca5f0SHans Verkuil 		out[5] = workspace2[2] - workspace2[6];
511dacca5f0SHans Verkuil 		out[6] = workspace2[3] - workspace2[7];
512dacca5f0SHans Verkuil 		out[7] = workspace2[3] + workspace2[7];
513dacca5f0SHans Verkuil 	}
514dacca5f0SHans Verkuil 
515dacca5f0SHans Verkuil 	out = output_block;
516dacca5f0SHans Verkuil 
517dacca5f0SHans Verkuil 	for (i = 0; i < 8; i++, out++) {
518dacca5f0SHans Verkuil 		/* stage 1 */
519dacca5f0SHans Verkuil 		workspace1[0]  = out[0] + out[1 * 8];
520dacca5f0SHans Verkuil 		workspace1[1]  = out[0] - out[1 * 8];
521dacca5f0SHans Verkuil 
522dacca5f0SHans Verkuil 		workspace1[2]  = out[2 * 8] + out[3 * 8];
523dacca5f0SHans Verkuil 		workspace1[3]  = out[2 * 8] - out[3 * 8];
524dacca5f0SHans Verkuil 
525dacca5f0SHans Verkuil 		workspace1[4]  = out[4 * 8] + out[5 * 8];
526dacca5f0SHans Verkuil 		workspace1[5]  = out[4 * 8] - out[5 * 8];
527dacca5f0SHans Verkuil 
528dacca5f0SHans Verkuil 		workspace1[6]  = out[6 * 8] + out[7 * 8];
529dacca5f0SHans Verkuil 		workspace1[7]  = out[6 * 8] - out[7 * 8];
530dacca5f0SHans Verkuil 
531dacca5f0SHans Verkuil 		/* stage 2 */
532dacca5f0SHans Verkuil 		workspace2[0] = workspace1[0] + workspace1[2];
533dacca5f0SHans Verkuil 		workspace2[1] = workspace1[0] - workspace1[2];
534dacca5f0SHans Verkuil 		workspace2[2] = workspace1[1] - workspace1[3];
535dacca5f0SHans Verkuil 		workspace2[3] = workspace1[1] + workspace1[3];
536dacca5f0SHans Verkuil 
537dacca5f0SHans Verkuil 		workspace2[4] = workspace1[4] + workspace1[6];
538dacca5f0SHans Verkuil 		workspace2[5] = workspace1[4] - workspace1[6];
539dacca5f0SHans Verkuil 		workspace2[6] = workspace1[5] - workspace1[7];
540dacca5f0SHans Verkuil 		workspace2[7] = workspace1[5] + workspace1[7];
541dacca5f0SHans Verkuil 
542dacca5f0SHans Verkuil 		/* stage 3 */
543dacca5f0SHans Verkuil 		if (inter) {
544dacca5f0SHans Verkuil 			int d;
545dacca5f0SHans Verkuil 
546dacca5f0SHans Verkuil 			out[0 * 8] = workspace2[0] + workspace2[4];
547dacca5f0SHans Verkuil 			out[1 * 8] = workspace2[0] - workspace2[4];
548dacca5f0SHans Verkuil 			out[2 * 8] = workspace2[1] - workspace2[5];
549dacca5f0SHans Verkuil 			out[3 * 8] = workspace2[1] + workspace2[5];
550dacca5f0SHans Verkuil 			out[4 * 8] = workspace2[2] + workspace2[6];
551dacca5f0SHans Verkuil 			out[5 * 8] = workspace2[2] - workspace2[6];
552dacca5f0SHans Verkuil 			out[6 * 8] = workspace2[3] - workspace2[7];
553dacca5f0SHans Verkuil 			out[7 * 8] = workspace2[3] + workspace2[7];
554dacca5f0SHans Verkuil 
555dacca5f0SHans Verkuil 			for (d = 0; d < 8; d++)
556dacca5f0SHans Verkuil 				out[8 * d] >>= 6;
557dacca5f0SHans Verkuil 		} else {
558dacca5f0SHans Verkuil 			int d;
559dacca5f0SHans Verkuil 
560dacca5f0SHans Verkuil 			out[0 * 8] = workspace2[0] + workspace2[4];
561dacca5f0SHans Verkuil 			out[1 * 8] = workspace2[0] - workspace2[4];
562dacca5f0SHans Verkuil 			out[2 * 8] = workspace2[1] - workspace2[5];
563dacca5f0SHans Verkuil 			out[3 * 8] = workspace2[1] + workspace2[5];
564dacca5f0SHans Verkuil 			out[4 * 8] = workspace2[2] + workspace2[6];
565dacca5f0SHans Verkuil 			out[5 * 8] = workspace2[2] - workspace2[6];
566dacca5f0SHans Verkuil 			out[6 * 8] = workspace2[3] - workspace2[7];
567dacca5f0SHans Verkuil 			out[7 * 8] = workspace2[3] + workspace2[7];
568dacca5f0SHans Verkuil 
569dacca5f0SHans Verkuil 			for (d = 0; d < 8; d++) {
570dacca5f0SHans Verkuil 				out[8 * d] >>= 6;
571dacca5f0SHans Verkuil 				out[8 * d] += 128;
572dacca5f0SHans Verkuil 			}
573dacca5f0SHans Verkuil 		}
574dacca5f0SHans Verkuil 	}
575dacca5f0SHans Verkuil }
576dacca5f0SHans Verkuil 
/*
 * Copy an 8x8 block of 8-bit samples into a packed array of 64 s16.
 *
 * @input:      first sample of the block
 * @dst:        receives the samples, 8 per row
 * @stride:     distance between two input rows, in bytes
 * @input_step: distance between two samples of a row, in bytes
 */
static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	/* bytes from the end of one row's samples to the next row */
	const unsigned int row_gap = stride - 8 * input_step;
	unsigned int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			dst[row * 8 + col] = *input;
			input += input_step;
		}
		input += row_gap;
	}
}
588dacca5f0SHans Verkuil 
/*
 * Sum of the absolute differences between each of the 64 values in
 * @input and their mean (integer division of the sum by 64).
 */
static int var_intra(const s16 *input)
{
	int32_t sum = 0;
	int32_t total = 0;
	int32_t avg;
	int k;

	for (k = 0; k < 8 * 8; k++)
		sum += input[k];
	avg = sum / 64;

	for (k = 0; k < 8 * 8; k++) {
		const int32_t diff = input[k] - avg;

		total += diff < 0 ? -diff : diff;
	}
	return total;
}
604dacca5f0SHans Verkuil 
/*
 * Sum of the absolute element-wise differences between the two
 * 64-entry blocks @old and @new.
 */
static int var_inter(const s16 *old, const s16 *new)
{
	int32_t total = 0;
	int k;

	for (k = 0; k < 8 * 8; k++) {
		const int diff = old[k] - new[k];

		total += diff < 0 ? -diff : diff;
	}
	return total;
}
614dacca5f0SHans Verkuil 
615dacca5f0SHans Verkuil static noinline_for_stack int
decide_blocktype(const u8 * cur,const u8 * reference,s16 * deltablock,unsigned int stride,unsigned int input_step)616dacca5f0SHans Verkuil decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
617dacca5f0SHans Verkuil 		 unsigned int stride, unsigned int input_step)
618dacca5f0SHans Verkuil {
619dacca5f0SHans Verkuil 	s16 tmp[64];
620dacca5f0SHans Verkuil 	s16 old[64];
621dacca5f0SHans Verkuil 	s16 *work = tmp;
622dacca5f0SHans Verkuil 	unsigned int k, l;
623dacca5f0SHans Verkuil 	int vari;
624dacca5f0SHans Verkuil 	int vard;
625dacca5f0SHans Verkuil 
626dacca5f0SHans Verkuil 	fill_encoder_block(cur, tmp, stride, input_step);
627dacca5f0SHans Verkuil 	fill_encoder_block(reference, old, 8, 1);
628dacca5f0SHans Verkuil 	vari = var_intra(tmp);
629dacca5f0SHans Verkuil 
630dacca5f0SHans Verkuil 	for (k = 0; k < 8; k++) {
631dacca5f0SHans Verkuil 		for (l = 0; l < 8; l++) {
632dacca5f0SHans Verkuil 			*deltablock = *work - *reference;
633dacca5f0SHans Verkuil 			deltablock++;
634dacca5f0SHans Verkuil 			work++;
635dacca5f0SHans Verkuil 			reference++;
636dacca5f0SHans Verkuil 		}
637dacca5f0SHans Verkuil 	}
638dacca5f0SHans Verkuil 	deltablock -= 64;
639dacca5f0SHans Verkuil 	vard = var_inter(old, tmp);
640dacca5f0SHans Verkuil 	return vari <= vard ? IBLOCK : PBLOCK;
641dacca5f0SHans Verkuil }
642dacca5f0SHans Verkuil 
/*
 * Store an 8x8 block of s16 samples into a u8 image, saturating every
 * sample to the range 0..255.
 *
 * @dst advances by @dst_step bytes per sample; consecutive rows of the
 * block start @stride bytes apart.
 */
static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
			       unsigned int dst_step)
{
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++) {
			s16 v = *input++;

			if (v < 0)
				v = 0;
			else if (v > 255)
				v = 255;
			*dst = v;
			dst += dst_step;
		}
		/* rewind over the 8 samples just written, move one row down */
		dst += stride - (8 * dst_step);
	}
}
660dacca5f0SHans Verkuil 
/*
 * Add the reference pixels of an 8x8 block to the 64 decoded deltas in
 * place, saturating every result to 0..255.
 *
 * @ref advances by @ref_step bytes per sample; consecutive rows of the
 * reference block start @stride bytes apart.
 */
static void add_deltas(s16 *deltas, const u8 *ref, int stride,
		       unsigned int ref_step)
{
	int row, col;

	for (row = 0; row < 8; row++) {
		for (col = 0; col < 8; col++, deltas++, ref += ref_step) {
			s16 v = *deltas + *ref;

			/*
			 * Because of quantization the reconstructed value
			 * may end up slightly out of range, so clamp it.
			 */
			if (v < 0)
				v = 0;
			else if (v > 255)
				v = 255;
			*deltas = v;
		}
		ref += stride - (8 * ref_step);
	}
}
683dacca5f0SHans Verkuil 
/*
 * Encode one plane as a sequence of 8x8 blocks.
 *
 * @input:         first sample of the plane; samples are @input_step bytes
 *                 apart and rows are @stride bytes apart
 * @refp:          reference plane, stored as consecutive 64-byte blocks; it
 *                 is overwritten with the reconstructed blocks unless
 *                 @next_is_intra is set
 * @rlco:          in/out write position in the compressed buffer
 * @rlco_max:      once the write position reaches this address the plane
 *                 is stored uncompressed instead
 * @cf:            scratch coefficient buffers and quantization parameters
 * @height/@width: plane size; both are rounded up to a multiple of 8
 * @is_intra:      force every block to be intra coded
 *
 * Returns a mask with FWHT_FRAME_PCODED set if at least one block was
 * inter coded, or with FWHT_FRAME_UNENCODED set (and FWHT_FRAME_PCODED
 * cleared) if the plane was copied as raw bytes.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	const __be16 pframe_bit = htons(PFRAME_BIT);
	u8 *const plane = input;
	__be16 *const out_start = *rlco;
	s16 deltablock[64];
	unsigned int prev_size = 0;
	unsigned int bx, by;
	const u8 *row;
	u32 flags = 0;
	u8 *out;

	width = round_up(width, 8);
	height = round_up(height, 8);

	for (by = 0; by < height / 8; by++) {
		input = plane + by * 8 * stride;
		for (bx = 0; bx < width / 8; bx++) {
			/* Intra frames only contain intra coded blocks. */
			int blocktype = IBLOCK;
			unsigned int size;
			bool merged = false;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
							     deltablock,
							     stride,
							     input_step);

			if (blocktype == IBLOCK) {
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			} else {
				/* inter coded block */
				flags |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			}

			/*
			 * Reconstruct the block into the reference plane,
			 * unless the next frame will not use a reference.
			 */
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8, 1);
				fill_decoder_block(refp, cf->de_fwht, 8, 1);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);

			/*
			 * If this block's payload (everything after the
			 * header word) equals that of the previous block and
			 * both have the same P-frame bit, bump the duplicate
			 * counter in the previous header instead of storing
			 * the block again.
			 */
			if (prev_size == size) {
				__be16 *prev = *rlco - size;

				if (!memcmp(*rlco + 1, prev + 1,
					    2 * size - 2)) {
					s16 hdr = ntohs(*prev);

					if (!((*prev ^ **rlco) & pframe_bit) &&
					    (hdr & DUPS_MASK) < DUPS_MASK) {
						*prev = htons(hdr + 2);
						merged = true;
					}
				}
			}
			if (!merged)
				*rlco += size;

			if (*rlco >= rlco_max) {
				flags |= FWHT_FRAME_UNENCODED;
				goto done;
			}
			prev_size = size;
		}
	}

done:
	if (!(flags & FWHT_FRAME_UNENCODED))
		return flags;

	/*
	 * The compressed stream must never contain the magic header, so
	 * 0xff is replaced by 0xfe while copying the raw samples. Limited
	 * range YUV should not contain such values anyway.
	 */
	out = (u8 *)out_start;
	row = plane;
	for (by = 0; by < height; by++, row += stride) {
		const u8 *p = row;

		for (bx = 0; bx < width; bx++, p += input_step)
			*out++ = (*p == 0xff) ? 0xfe : *p;
	}
	*rlco = (__be16 *)out;

	return flags & ~FWHT_FRAME_PCODED;
}
776dacca5f0SHans Verkuil 
fwht_encode_frame(struct fwht_raw_frame * frm,struct fwht_raw_frame * ref_frm,struct fwht_cframe * cf,bool is_intra,bool next_is_intra,unsigned int width,unsigned int height,unsigned int stride,unsigned int chroma_stride)777dacca5f0SHans Verkuil u32 fwht_encode_frame(struct fwht_raw_frame *frm,
778dacca5f0SHans Verkuil 		      struct fwht_raw_frame *ref_frm,
779dacca5f0SHans Verkuil 		      struct fwht_cframe *cf,
780dacca5f0SHans Verkuil 		      bool is_intra, bool next_is_intra,
781dacca5f0SHans Verkuil 		      unsigned int width, unsigned int height,
782dacca5f0SHans Verkuil 		      unsigned int stride, unsigned int chroma_stride)
783dacca5f0SHans Verkuil {
784dacca5f0SHans Verkuil 	unsigned int size = height * width;
785dacca5f0SHans Verkuil 	__be16 *rlco = cf->rlc_data;
786dacca5f0SHans Verkuil 	__be16 *rlco_max;
787dacca5f0SHans Verkuil 	u32 encoding;
788dacca5f0SHans Verkuil 
789dacca5f0SHans Verkuil 	rlco_max = rlco + size / 2 - 256;
790dacca5f0SHans Verkuil 	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
791dacca5f0SHans Verkuil 				height, width, stride,
792dacca5f0SHans Verkuil 				frm->luma_alpha_step, is_intra, next_is_intra);
793dacca5f0SHans Verkuil 	if (encoding & FWHT_FRAME_UNENCODED)
794dacca5f0SHans Verkuil 		encoding |= FWHT_LUMA_UNENCODED;
795dacca5f0SHans Verkuil 	encoding &= ~FWHT_FRAME_UNENCODED;
796dacca5f0SHans Verkuil 
797dacca5f0SHans Verkuil 	if (frm->components_num >= 3) {
798dacca5f0SHans Verkuil 		u32 chroma_h = height / frm->height_div;
799dacca5f0SHans Verkuil 		u32 chroma_w = width / frm->width_div;
800dacca5f0SHans Verkuil 		unsigned int chroma_size = chroma_h * chroma_w;
801dacca5f0SHans Verkuil 
802dacca5f0SHans Verkuil 		rlco_max = rlco + chroma_size / 2 - 256;
803dacca5f0SHans Verkuil 		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
804dacca5f0SHans Verkuil 					 cf, chroma_h, chroma_w,
805dacca5f0SHans Verkuil 					 chroma_stride, frm->chroma_step,
806dacca5f0SHans Verkuil 					 is_intra, next_is_intra);
807dacca5f0SHans Verkuil 		if (encoding & FWHT_FRAME_UNENCODED)
808dacca5f0SHans Verkuil 			encoding |= FWHT_CB_UNENCODED;
809dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_UNENCODED;
810dacca5f0SHans Verkuil 		rlco_max = rlco + chroma_size / 2 - 256;
811dacca5f0SHans Verkuil 		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
812dacca5f0SHans Verkuil 					 cf, chroma_h, chroma_w,
813dacca5f0SHans Verkuil 					 chroma_stride, frm->chroma_step,
814dacca5f0SHans Verkuil 					 is_intra, next_is_intra);
815dacca5f0SHans Verkuil 		if (encoding & FWHT_FRAME_UNENCODED)
816dacca5f0SHans Verkuil 			encoding |= FWHT_CR_UNENCODED;
817dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_UNENCODED;
818dacca5f0SHans Verkuil 	}
819dacca5f0SHans Verkuil 
820dacca5f0SHans Verkuil 	if (frm->components_num == 4) {
821dacca5f0SHans Verkuil 		rlco_max = rlco + size / 2 - 256;
822dacca5f0SHans Verkuil 		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
823dacca5f0SHans Verkuil 					 rlco_max, cf, height, width,
824dacca5f0SHans Verkuil 					 stride, frm->luma_alpha_step,
825dacca5f0SHans Verkuil 					 is_intra, next_is_intra);
826dacca5f0SHans Verkuil 		if (encoding & FWHT_FRAME_UNENCODED)
827dacca5f0SHans Verkuil 			encoding |= FWHT_ALPHA_UNENCODED;
828dacca5f0SHans Verkuil 		encoding &= ~FWHT_FRAME_UNENCODED;
829dacca5f0SHans Verkuil 	}
830dacca5f0SHans Verkuil 
831dacca5f0SHans Verkuil 	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
832dacca5f0SHans Verkuil 	return encoding;
833dacca5f0SHans Verkuil }
834dacca5f0SHans Verkuil 
/*
 * Decode one plane from the compressed stream.
 *
 * @cf:              scratch coefficient buffers
 * @rlco:            in/out read position in the compressed buffer
 * @height/@width:   plane size; both are rounded up to a multiple of 8
 * @ref:             reference plane, or NULL when no reference is
 *                   available (every block is then decoded as intra)
 * @ref_stride/@ref_step: row and sample distance in bytes within @ref
 * @dst:             destination plane
 * @dst_stride/@dst_step: row and sample distance in bytes within @dst
 * @uncompressed:    the plane was stored as raw bytes
 * @end_of_rlco_buf: last valid __be16 word of the compressed buffer
 *
 * Returns false if the compressed data would run past the end of the
 * buffer, true otherwise.
 */
static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
			 unsigned int ref_step, u8 *dst,
			 unsigned int dst_stride, unsigned int dst_step,
			 bool uncompressed, const __be16 *end_of_rlco_buf)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	u16 stat = 0;
	unsigned int i, j;
	bool is_intra = !ref;

	width = round_up(width, 8);
	height = round_up(height, 8);

	if (uncompressed) {
		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
			return false;
		for (j = 0; j < height; j++) {
			const u8 *src = (const u8 *)*rlco;

			/*
			 * The raw data is stored as tightly packed bytes,
			 * but the samples in the destination are dst_step
			 * bytes apart, exactly as in the compressed path.
			 */
			if (dst_step == 1) {
				memcpy(dst, src, width);
			} else {
				for (i = 0; i < width; i++)
					dst[i * dst_step] = src[i];
			}
			dst += dst_stride;
			*rlco += width / 2;
		}
		return true;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* refp is only dereferenced for inter coded blocks */
			const u8 *refp = is_intra ? NULL :
				ref + j * 8 * ref_stride + i * 8 * ref_step;
			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;

			if (copies) {
				/*
				 * The previous block's header announced
				 * duplicates: reuse its decoded samples
				 * instead of reading from the stream.
				 */
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if ((stat & PFRAME_BIT) && !is_intra)
					add_deltas(cf->de_fwht, refp,
						   ref_stride, ref_step);
				fill_decoder_block(dstp, cf->de_fwht,
						   dst_stride, dst_step);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
			if (stat & OVERFLOW_BIT)
				return false;
			if ((stat & PFRAME_BIT) && !is_intra)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);

			/* keep the samples before the reference is added */
			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if ((stat & PFRAME_BIT) && !is_intra)
				add_deltas(cf->de_fwht, refp,
					   ref_stride, ref_step);
			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
					   dst_step);
		}
	}
	return true;
}
909dacca5f0SHans Verkuil 
/*
 * Decode all planes of a compressed frame @cf into @dst.
 *
 * Luma is always decoded. Cb and Cr are decoded when @components_num is
 * at least 3, at half the luma height/width unless the
 * V4L2_FWHT_FL_CHROMA_FULL_HEIGHT/_WIDTH flags are set in @hdr_flags.
 * Alpha is decoded when @components_num is 4. The
 * V4L2_FWHT_FL_*_IS_UNCOMPRESSED flags select raw decoding per plane.
 *
 * Returns false as soon as one plane fails to decode, true otherwise.
 */
bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
		       unsigned int components_num, unsigned int width,
		       unsigned int height, const struct fwht_raw_frame *ref,
		       unsigned int ref_stride, unsigned int ref_chroma_stride,
		       struct fwht_raw_frame *dst, unsigned int dst_stride,
		       unsigned int dst_chroma_stride)
{
	const __be16 *rlco = cf->rlc_data;
	/* address of the last __be16 word that belongs to the frame */
	const __be16 *end_of_rlco_buf = cf->rlc_data +
			(cf->size / sizeof(*rlco)) - 1;

	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
			  ref->luma_alpha_step, dst->luma, dst_stride,
			  dst->luma_alpha_step,
			  hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
			  end_of_rlco_buf))
		return false;

	if (components_num >= 3) {
		u32 chroma_h = height;
		u32 chroma_w = width;

		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
			chroma_h /= 2;
		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
			chroma_w /= 2;

		if (!decode_plane(cf, &rlco, chroma_h, chroma_w, ref->cb,
				  ref_chroma_stride, ref->chroma_step,
				  dst->cb, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;

		if (!decode_plane(cf, &rlco, chroma_h, chroma_w, ref->cr,
				  ref_chroma_stride, ref->chroma_step,
				  dst->cr, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	}

	if (components_num != 4)
		return true;

	return decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
			    ref->luma_alpha_step, dst->alpha, dst_stride,
			    dst->luma_alpha_step,
			    hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
			    end_of_rlco_buf);
}
960