// SPDX-License-Identifier: LGPL-2.1+
/*
 * Copyright 2016 Tom aan de Wiel
 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *
 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
 *
 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
 * R.D. Brown, 1977
 */

#include <linux/string.h>
#include <linux/kernel.h>
#include "codec-fwht.h"

#define OVERFLOW_BIT BIT(14)

/*
 * Note: bit 0 of the header must always be 0. Otherwise it cannot
 * be guaranteed that the magic 8 byte sequence (see below) can
 * never occur in the rlc output.
 */
#define PFRAME_BIT BIT(15)
#define DUPS_MASK 0x1ffe

#define PBLOCK 0
#define IBLOCK 1

#define ALL_ZEROS 15

static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
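
/*
 * zigzag[] above maps a position in the zigzag scan to the raster
 * offset inside the 8x8 coefficient block: for example zigzag[2] == 8,
 * so the third coefficient in scan order is taken from row 1, column 0.
 * Scanning in this order groups the low-sequency coefficients first so
 * that the high-sequency ones, which are usually quantized to zero,
 * end up as one long trailing run that rlc() below can drop.
 */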

/*
 * noinline_for_stack to work around
 * https://bugs.llvm.org/show_bug.cgi?id=38809
 */
static int noinline_for_stack
rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
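
/*
 * Worked example (hypothetical input): if the coefficients in zigzag
 * scan order are 10, 0, 0, -3 followed by 60 zeroes, then lastzero_run
 * is 60 (> 14), only the first four scan positions are encoded and
 * rlc() emits four 16-bit words:
 *
 *	header			(PFRAME_BIT set for P-blocks, else 0)
 *	run = 0, coeff = 10
 *	run = 2, coeff = -3
 *	ALL_ZEROS		("the rest of the block is zero")
 *
 * Each non-header word holds the zero-run length in its low 4 bits and
 * the signed coefficient in the upper 12 bits.
 */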

/*
 * This function will worst-case increase rlc_in by 65*2 bytes:
 * one s16 value for the header and 8 * 8 coefficients of type s16.
 */
static noinline_for_stack u16
derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
{
	/* header */
	const __be16 *input = *rlc_in;
	u16 stat;
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	if (input > end_of_input)
		return OVERFLOW_BIT;
	stat = ntohs(*input++);

	/*
	 * Now de-compress: each 16-bit code word expands to up to 15
	 * coefficients (or fills the remainder of the 64 coefficients
	 * with zeroes if it is the last code word).
	 *
	 * So block has to hold 8 * 8 + 16 coefficients; the '+ 16'
	 * allows for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in;
		int length;
		int coeff;

		if (input > end_of_input)
			return OVERFLOW_BIT;
		in = ntohs(*input++);
		length = in & 0xf;
		coeff = in >> 4;

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return stat;
}
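
/*
 * Continuing the example above, derlc() reads the header, expands
 * "run = 0, coeff = 10" to the single value 10 and "run = 2, coeff = -3"
 * to 0, 0, -3, and on seeing ALL_ZEROS fills the remaining positions
 * with zeroes before undoing the zigzag scan into dwht_out.
 */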

static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};

static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};

static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -qp && *coeff <= qp)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}
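
/*
 * Example with hypothetical numbers: with quant_table[0] == 2 an intra
 * DC coefficient of 17 is shifted right to 4. With i_frame_qp == 1 it
 * survives the dead-zone test (4 > qp) and is de-quantized back to
 * 4 << 2 == 16, leaving a quantization error of 1. Coefficients that
 * land in [-qp, qp] are zeroed in both the quantized and de-quantized
 * block.
 */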

static void dequantize_intra(s16 *coeff)
{
	const int *quant = quant_table;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}

static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++) {
		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
			*coeff >>= *quant;
			if (*coeff >= -qp && *coeff <= qp)
				*coeff = *de_coeff = 0;
			else
				*de_coeff = *coeff << *quant;
		}
	}
}

static void dequantize_inter(s16 *coeff)
{
	const int *quant = quant_table_p;
	int i, j;

	for (j = 0; j < 8; j++)
		for (i = 0; i < 8; i++, quant++, coeff++)
			*coeff <<= *quant;
}

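/*
 * For intra blocks the input samples are centred around zero before the
 * transform: subtracting 128 from each of the two samples summed in
 * stage 1 is folded into the single '- add' (256) below, and ifwht()
 * adds the 128 back after the inverse transform. The difference terms
 * need no offset since it cancels out there.
 */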
static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
				    unsigned int stride,
				    unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		switch (input_step) {
		case 1:
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
			break;
		case 2:
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
			break;
		case 3:
			workspace1[0]  = tmp[0] + tmp[3] - add;
			workspace1[1]  = tmp[0] - tmp[3];

			workspace1[2]  = tmp[6] + tmp[9] - add;
			workspace1[3]  = tmp[6] - tmp[9];

			workspace1[4]  = tmp[12] + tmp[15] - add;
			workspace1[5]  = tmp[12] - tmp[15];

			workspace1[6]  = tmp[18] + tmp[21] - add;
			workspace1[7]  = tmp[18] - tmp[21];
			break;
		default:
			workspace1[0]  = tmp[0] + tmp[4] - add;
			workspace1[1]  = tmp[0] - tmp[4];

			workspace1[2]  = tmp[8] + tmp[12] - add;
			workspace1[3]  = tmp[8] - tmp[12];

			workspace1[4]  = tmp[16] + tmp[20] - add;
			workspace1[5]  = tmp[16] - tmp[20];

			workspace1[6]  = tmp[24] + tmp[28] - add;
			workspace1[7]  = tmp[24] - tmp[28];
			break;
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}

/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore, values can be negative... This is just a version that
 * works with 16-bit signed data.
 */
static void noinline_for_stack
fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1*8];
		workspace1[1]  = out[0] - out[1*8];

		workspace1[2]  = out[2*8] + out[3*8];
		workspace1[3]  = out[2*8] - out[3*8];

		workspace1[4]  = out[4*8] + out[5*8];
		workspace1[5]  = out[4*8] - out[5*8];

		workspace1[6]  = out[6*8] + out[7*8];
		workspace1[7]  = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}

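/*
 * Inverse transform: the Walsh-Hadamard transform is its own inverse up
 * to a scale factor. A forward plus inverse pass over an 8x8 block
 * multiplies every sample by 8 * 8 == 64, which the '>> 6' in the final
 * stage compensates for. For intra blocks the 128 that fwht() removed
 * from each sample is added back here as well.
 */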
static noinline_for_stack void
ifwht(const s16 *block, s16 *output_block, int intra)
{
	/*
	 * we'll need more than 8 bits for the transformed coefficients,
	 * so use the native word size of the CPU
	 */
	int workspace1[8], workspace2[8];
	int inter = intra ? 0 : 1;
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		if (inter) {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			for (d = 0; d < 8; d++)
				out[8 * d] >>= 6;
		} else {
			int d;

			out[0 * 8] = workspace2[0] + workspace2[4];
			out[1 * 8] = workspace2[0] - workspace2[4];
			out[2 * 8] = workspace2[1] - workspace2[5];
			out[3 * 8] = workspace2[1] + workspace2[5];
			out[4 * 8] = workspace2[2] + workspace2[6];
			out[5 * 8] = workspace2[2] - workspace2[6];
			out[6 * 8] = workspace2[3] - workspace2[7];
			out[7 * 8] = workspace2[3] + workspace2[7];

			for (d = 0; d < 8; d++) {
				out[8 * d] >>= 6;
				out[8 * d] += 128;
			}
		}
	}
}

static void fill_encoder_block(const u8 *input, s16 *dst,
			       unsigned int stride, unsigned int input_step)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input += input_step)
			*dst++ = *input;
		input += stride - 8 * input_step;
	}
}

static int var_intra(const s16 *input)
{
	int32_t mean = 0;
	int32_t ret = 0;
	const s16 *tmp = input;
	int i;

	for (i = 0; i < 8 * 8; i++, tmp++)
		mean += *tmp;
	mean /= 64;
	tmp = input;
	for (i = 0; i < 8 * 8; i++, tmp++)
		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
	return ret;
}

static int var_inter(const s16 *old, const s16 *new)
{
	int32_t ret = 0;
	int i;

	for (i = 0; i < 8 * 8; i++, old++, new++)
		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
	return ret;
}

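/*
 * Block type decision: var_intra() and var_inter() above are
 * sum-of-absolute-differences measures, taken against the block mean
 * and against the reference block respectively. If predicting from the
 * reference is no better than coding the block on its own, the block is
 * intra coded.
 */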
static noinline_for_stack int
decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
		 unsigned int stride, unsigned int input_step)
{
	s16 tmp[64];
	s16 old[64];
	s16 *work = tmp;
	unsigned int k, l;
	int vari;
	int vard;

	fill_encoder_block(cur, tmp, stride, input_step);
	fill_encoder_block(reference, old, 8, 1);
	vari = var_intra(tmp);

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltablock = *work - *reference;
			deltablock++;
			work++;
			reference++;
		}
	}
	deltablock -= 64;
	vard = var_inter(old, tmp);
	return vari <= vard ? IBLOCK : PBLOCK;
}

static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
			       unsigned int dst_step)
{
	int i, j;

	for (i = 0; i < 8; i++) {
		for (j = 0; j < 8; j++, input++, dst += dst_step) {
			if (*input < 0)
				*dst = 0;
			else if (*input > 255)
				*dst = 255;
			else
				*dst = *input;
		}
		dst += stride - (8 * dst_step);
	}
}

static void add_deltas(s16 *deltas, const u8 *ref, int stride,
		       unsigned int ref_step)
{
	int k, l;

	for (k = 0; k < 8; k++) {
		for (l = 0; l < 8; l++) {
			*deltas += *ref;
			ref += ref_step;
			/*
			 * Due to quantizing, it is possible that the
			 * decoded coefficients are slightly out of range.
			 */
			if (*deltas < 0)
				*deltas = 0;
			else if (*deltas > 255)
				*deltas = 255;
			deltas++;
		}
		ref += stride - (8 * ref_step);
	}
}

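/*
 * encode_plane() walks the plane in 8x8 blocks. Identical consecutive
 * rlc() outputs are not stored again: instead the header word of the
 * previous block is incremented by 2, so bits 1-12 (DUPS_MASK) count
 * the extra copies while bit 0 stays 0. decode_plane() undoes this via
 * 'copies = (stat & DUPS_MASK) >> 1'.
 */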
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	width = round_up(width, 8);
	height = round_up(height, 8);

	for (j = 0; j < height / 8; j++) {
		input = input_start + j * 8 * stride;
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, stride, input_step);
			if (blocktype == IBLOCK) {
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			} else {
				/* inter code */
				encoding |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			}
			if (!next_is_intra) {
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8, 1);
				fill_decoder_block(refp, cf->de_fwht, 8, 1);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max) {
				encoding |= FWHT_FRAME_UNENCODED;
				goto exit_loop;
			}
			last_size = size;
		}
	}

exit_loop:
	if (encoding & FWHT_FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;
		u8 *p;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (j = 0; j < height; j++) {
			for (i = 0, p = input; i < width; i++, p += input_step)
				*out++ = (*p == 0xff) ? 0xfe : *p;
			input += stride;
		}
		*rlco = (__be16 *)out;
		encoding &= ~FWHT_FRAME_PCODED;
	}
	return encoding;
}

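/*
 * Note: the rlco_max limits below stop each plane 256 words short of
 * its nominal budget. The overflow check in encode_plane() only runs
 * after a block has already been written and rlc() can emit up to 65
 * words per block, so the extra headroom appears to be there to absorb
 * that worst case.
 */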
u32 fwht_encode_frame(struct fwht_raw_frame *frm,
		      struct fwht_raw_frame *ref_frm,
		      struct fwht_cframe *cf,
		      bool is_intra, bool next_is_intra,
		      unsigned int width, unsigned int height,
		      unsigned int stride, unsigned int chroma_stride)
{
	unsigned int size = height * width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				height, width, stride,
				frm->luma_alpha_step, is_intra, next_is_intra);
	if (encoding & FWHT_FRAME_UNENCODED)
		encoding |= FWHT_LUMA_UNENCODED;
	encoding &= ~FWHT_FRAME_UNENCODED;

	if (frm->components_num >= 3) {
		u32 chroma_h = height / frm->height_div;
		u32 chroma_w = width / frm->width_div;
		unsigned int chroma_size = chroma_h * chroma_w;

		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CB_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CR_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	if (frm->components_num == 4) {
		rlco_max = rlco + size / 2 - 256;
		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
					 rlco_max, cf, height, width,
					 stride, frm->luma_alpha_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_ALPHA_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}

static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
			 unsigned int ref_step, u8 *dst,
			 unsigned int dst_stride, unsigned int dst_step,
			 bool uncompressed, const __be16 *end_of_rlco_buf)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	u16 stat;
	unsigned int i, j;
	bool is_intra = !ref;

	width = round_up(width, 8);
	height = round_up(height, 8);

	if (uncompressed) {
		int i;

		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
			return false;
		for (i = 0; i < height; i++) {
			memcpy(dst, *rlco, width);
			dst += dst_stride;
			*rlco += width / 2;
		}
		return true;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			const u8 *refp = ref + j * 8 * ref_stride +
				i * 8 * ref_step;
			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;

			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if ((stat & PFRAME_BIT) && !is_intra)
					add_deltas(cf->de_fwht, refp,
						   ref_stride, ref_step);
				fill_decoder_block(dstp, cf->de_fwht,
						   dst_stride, dst_step);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
			if (stat & OVERFLOW_BIT)
				return false;
			if ((stat & PFRAME_BIT) && !is_intra)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);

			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if ((stat & PFRAME_BIT) && !is_intra)
				add_deltas(cf->de_fwht, refp,
					   ref_stride, ref_step);
			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
					   dst_step);
		}
	}
	return true;
}

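/*
 * fwht_decode_frame() decodes luma, then (if present) the two chroma
 * planes and alpha, in the same order the encoder wrote them. The
 * FWHT_FL_CHROMA_FULL_HEIGHT/WIDTH flags describe the subsampling:
 * with neither flag set (e.g. 4:2:0) both chroma dimensions are halved
 * before decoding.
 */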
bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
		       unsigned int components_num, unsigned int width,
		       unsigned int height, const struct fwht_raw_frame *ref,
		       unsigned int ref_stride, unsigned int ref_chroma_stride,
		       struct fwht_raw_frame *dst, unsigned int dst_stride,
		       unsigned int dst_chroma_stride)
{
	const __be16 *rlco = cf->rlc_data;
	const __be16 *end_of_rlco_buf = cf->rlc_data +
			(cf->size / sizeof(*rlco)) - 1;

	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
			  ref->luma_alpha_step, dst->luma, dst_stride,
			  dst->luma_alpha_step,
			  hdr_flags & FWHT_FL_LUMA_IS_UNCOMPRESSED,
			  end_of_rlco_buf))
		return false;

	if (components_num >= 3) {
		u32 h = height;
		u32 w = width;

		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_HEIGHT))
			h /= 2;
		if (!(hdr_flags & FWHT_FL_CHROMA_FULL_WIDTH))
			w /= 2;

		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
				  ref->chroma_step, dst->cb, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & FWHT_FL_CB_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
				  ref->chroma_step, dst->cr, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & FWHT_FL_CR_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	}

	if (components_num == 4)
		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
				  ref->luma_alpha_step, dst->alpha, dst_stride,
				  dst->luma_alpha_step,
				  hdr_flags & FWHT_FL_ALPHA_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	return true;
}