1dacca5f0SHans Verkuil // SPDX-License-Identifier: LGPL-2.1+ 2dacca5f0SHans Verkuil /* 3dacca5f0SHans Verkuil * Copyright 2016 Tom aan de Wiel 4dacca5f0SHans Verkuil * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. 5dacca5f0SHans Verkuil * 6dacca5f0SHans Verkuil * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper: 7dacca5f0SHans Verkuil * 8dacca5f0SHans Verkuil * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms, 9dacca5f0SHans Verkuil * R.D. Brown, 1977 10dacca5f0SHans Verkuil */ 11dacca5f0SHans Verkuil 12dacca5f0SHans Verkuil #include <linux/string.h> 13dacca5f0SHans Verkuil #include <linux/kernel.h> 14dacca5f0SHans Verkuil #include "codec-fwht.h" 15dacca5f0SHans Verkuil 16dacca5f0SHans Verkuil #define OVERFLOW_BIT BIT(14) 17dacca5f0SHans Verkuil 18dacca5f0SHans Verkuil /* 19dacca5f0SHans Verkuil * Note: bit 0 of the header must always be 0. Otherwise it cannot 20dacca5f0SHans Verkuil * be guaranteed that the magic 8 byte sequence (see below) can 21dacca5f0SHans Verkuil * never occur in the rlc output. 22dacca5f0SHans Verkuil */ 23dacca5f0SHans Verkuil #define PFRAME_BIT BIT(15) 24dacca5f0SHans Verkuil #define DUPS_MASK 0x1ffe 25dacca5f0SHans Verkuil 26dacca5f0SHans Verkuil #define PBLOCK 0 27dacca5f0SHans Verkuil #define IBLOCK 1 28dacca5f0SHans Verkuil 29dacca5f0SHans Verkuil #define ALL_ZEROS 15 30dacca5f0SHans Verkuil 31dacca5f0SHans Verkuil static const uint8_t zigzag[64] = { 32dacca5f0SHans Verkuil 0, 33dacca5f0SHans Verkuil 1, 8, 34dacca5f0SHans Verkuil 2, 9, 16, 35dacca5f0SHans Verkuil 3, 10, 17, 24, 36dacca5f0SHans Verkuil 4, 11, 18, 25, 32, 37dacca5f0SHans Verkuil 5, 12, 19, 26, 33, 40, 38dacca5f0SHans Verkuil 6, 13, 20, 27, 34, 41, 48, 39dacca5f0SHans Verkuil 7, 14, 21, 28, 35, 42, 49, 56, 40dacca5f0SHans Verkuil 15, 22, 29, 36, 43, 50, 57, 41dacca5f0SHans Verkuil 23, 30, 37, 44, 51, 58, 42dacca5f0SHans Verkuil 31, 38, 45, 52, 59, 43dacca5f0SHans Verkuil 39, 46, 53, 60, 44dacca5f0SHans Verkuil 47, 54, 61, 45dacca5f0SHans Verkuil 55, 62, 46dacca5f0SHans Verkuil 63, 47dacca5f0SHans Verkuil }; 48dacca5f0SHans Verkuil 49dacca5f0SHans Verkuil /* 50dacca5f0SHans Verkuil * noinline_for_stack to work around 51dacca5f0SHans Verkuil * https://bugs.llvm.org/show_bug.cgi?id=38809 52dacca5f0SHans Verkuil */ 53dacca5f0SHans Verkuil static int noinline_for_stack 54dacca5f0SHans Verkuil rlc(const s16 *in, __be16 *output, int blocktype) 55dacca5f0SHans Verkuil { 56dacca5f0SHans Verkuil s16 block[8 * 8]; 57dacca5f0SHans Verkuil s16 *wp = block; 58dacca5f0SHans Verkuil int i = 0; 59dacca5f0SHans Verkuil int x, y; 60dacca5f0SHans Verkuil int ret = 0; 61dacca5f0SHans Verkuil 62dacca5f0SHans Verkuil /* read in block from framebuffer */ 63dacca5f0SHans Verkuil int lastzero_run = 0; 64dacca5f0SHans Verkuil int to_encode; 65dacca5f0SHans Verkuil 66dacca5f0SHans Verkuil for (y = 0; y < 8; y++) { 67dacca5f0SHans Verkuil for (x = 0; x < 8; x++) { 68dacca5f0SHans Verkuil *wp = in[x + y * 8]; 69dacca5f0SHans Verkuil wp++; 70dacca5f0SHans Verkuil } 71dacca5f0SHans Verkuil } 72dacca5f0SHans Verkuil 73dacca5f0SHans Verkuil /* keep track of amount of trailing zeros */ 74dacca5f0SHans Verkuil for (i = 63; i >= 0 && !block[zigzag[i]]; i--) 75dacca5f0SHans Verkuil lastzero_run++; 76dacca5f0SHans Verkuil 77dacca5f0SHans Verkuil *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0); 78dacca5f0SHans Verkuil ret++; 79dacca5f0SHans Verkuil 80dacca5f0SHans Verkuil to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0); 81dacca5f0SHans Verkuil 82dacca5f0SHans Verkuil i = 0; 83dacca5f0SHans Verkuil while (i < to_encode) { 84dacca5f0SHans Verkuil int cnt = 0; 85dacca5f0SHans Verkuil int tmp; 86dacca5f0SHans Verkuil 87dacca5f0SHans Verkuil /* count leading zeros */ 88dacca5f0SHans Verkuil while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) { 89dacca5f0SHans Verkuil cnt++; 90dacca5f0SHans Verkuil i++; 91dacca5f0SHans Verkuil if (i == to_encode) { 92dacca5f0SHans Verkuil cnt--; 93dacca5f0SHans Verkuil break; 94dacca5f0SHans Verkuil } 95dacca5f0SHans Verkuil } 96dacca5f0SHans Verkuil /* 4 bits for run, 12 for coefficient (quantization by 4) */ 97dacca5f0SHans Verkuil *output++ = htons((cnt | tmp << 4)); 98dacca5f0SHans Verkuil i++; 99dacca5f0SHans Verkuil ret++; 100dacca5f0SHans Verkuil } 101dacca5f0SHans Verkuil if (lastzero_run > 14) { 102dacca5f0SHans Verkuil *output = htons(ALL_ZEROS | 0); 103dacca5f0SHans Verkuil ret++; 104dacca5f0SHans Verkuil } 105dacca5f0SHans Verkuil 106dacca5f0SHans Verkuil return ret; 107dacca5f0SHans Verkuil } 108dacca5f0SHans Verkuil 109dacca5f0SHans Verkuil /* 110dacca5f0SHans Verkuil * This function will worst-case increase rlc_in by 65*2 bytes: 111dacca5f0SHans Verkuil * one s16 value for the header and 8 * 8 coefficients of type s16. 112dacca5f0SHans Verkuil */ 113dacca5f0SHans Verkuil static noinline_for_stack u16 114dacca5f0SHans Verkuil derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input) 115dacca5f0SHans Verkuil { 116dacca5f0SHans Verkuil /* header */ 117dacca5f0SHans Verkuil const __be16 *input = *rlc_in; 118dacca5f0SHans Verkuil u16 stat; 119dacca5f0SHans Verkuil int dec_count = 0; 120dacca5f0SHans Verkuil s16 block[8 * 8 + 16]; 121dacca5f0SHans Verkuil s16 *wp = block; 122dacca5f0SHans Verkuil int i; 123dacca5f0SHans Verkuil 124dacca5f0SHans Verkuil if (input > end_of_input) 125dacca5f0SHans Verkuil return OVERFLOW_BIT; 126dacca5f0SHans Verkuil stat = ntohs(*input++); 127dacca5f0SHans Verkuil 128dacca5f0SHans Verkuil /* 129dacca5f0SHans Verkuil * Now de-compress, it expands one byte to up to 15 bytes 130dacca5f0SHans Verkuil * (or fills the remainder of the 64 bytes with zeroes if it 131dacca5f0SHans Verkuil * is the last byte to expand). 132dacca5f0SHans Verkuil * 133dacca5f0SHans Verkuil * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to 134dacca5f0SHans Verkuil * allow for overflow if the incoming data was malformed. 135dacca5f0SHans Verkuil */ 136dacca5f0SHans Verkuil while (dec_count < 8 * 8) { 137dacca5f0SHans Verkuil s16 in; 138dacca5f0SHans Verkuil int length; 139dacca5f0SHans Verkuil int coeff; 140dacca5f0SHans Verkuil 141dacca5f0SHans Verkuil if (input > end_of_input) 142dacca5f0SHans Verkuil return OVERFLOW_BIT; 143dacca5f0SHans Verkuil in = ntohs(*input++); 144dacca5f0SHans Verkuil length = in & 0xf; 145dacca5f0SHans Verkuil coeff = in >> 4; 146dacca5f0SHans Verkuil 147dacca5f0SHans Verkuil /* fill remainder with zeros */ 148dacca5f0SHans Verkuil if (length == 15) { 149dacca5f0SHans Verkuil for (i = 0; i < 64 - dec_count; i++) 150dacca5f0SHans Verkuil *wp++ = 0; 151dacca5f0SHans Verkuil break; 152dacca5f0SHans Verkuil } 153dacca5f0SHans Verkuil 154dacca5f0SHans Verkuil for (i = 0; i < length; i++) 155dacca5f0SHans Verkuil *wp++ = 0; 156dacca5f0SHans Verkuil *wp++ = coeff; 157dacca5f0SHans Verkuil dec_count += length + 1; 158dacca5f0SHans Verkuil } 159dacca5f0SHans Verkuil 160dacca5f0SHans Verkuil wp = block; 161dacca5f0SHans Verkuil 162dacca5f0SHans Verkuil for (i = 0; i < 64; i++) { 163dacca5f0SHans Verkuil int pos = zigzag[i]; 164dacca5f0SHans Verkuil int y = pos / 8; 165dacca5f0SHans Verkuil int x = pos % 8; 166dacca5f0SHans Verkuil 167dacca5f0SHans Verkuil dwht_out[x + y * 8] = *wp++; 168dacca5f0SHans Verkuil } 169dacca5f0SHans Verkuil *rlc_in = input; 170dacca5f0SHans Verkuil return stat; 171dacca5f0SHans Verkuil } 172dacca5f0SHans Verkuil 173dacca5f0SHans Verkuil static const int quant_table[] = { 174dacca5f0SHans Verkuil 2, 2, 2, 2, 2, 2, 2, 2, 175dacca5f0SHans Verkuil 2, 2, 2, 2, 2, 2, 2, 2, 176dacca5f0SHans Verkuil 2, 2, 2, 2, 2, 2, 2, 3, 177dacca5f0SHans Verkuil 2, 2, 2, 2, 2, 2, 3, 6, 178dacca5f0SHans Verkuil 2, 2, 2, 2, 2, 3, 6, 6, 179dacca5f0SHans Verkuil 2, 2, 2, 2, 3, 6, 6, 6, 180dacca5f0SHans Verkuil 2, 2, 2, 3, 6, 6, 6, 6, 181dacca5f0SHans Verkuil 2, 2, 3, 6, 6, 6, 6, 8, 182dacca5f0SHans Verkuil }; 183dacca5f0SHans Verkuil 184dacca5f0SHans Verkuil static const int quant_table_p[] = { 185dacca5f0SHans Verkuil 3, 3, 3, 3, 3, 3, 3, 3, 186dacca5f0SHans Verkuil 3, 3, 3, 3, 3, 3, 3, 3, 187dacca5f0SHans Verkuil 3, 3, 3, 3, 3, 3, 3, 3, 188dacca5f0SHans Verkuil 3, 3, 3, 3, 3, 3, 3, 6, 189dacca5f0SHans Verkuil 3, 3, 3, 3, 3, 3, 6, 6, 190dacca5f0SHans Verkuil 3, 3, 3, 3, 3, 6, 6, 9, 191dacca5f0SHans Verkuil 3, 3, 3, 3, 6, 6, 9, 9, 192dacca5f0SHans Verkuil 3, 3, 3, 6, 6, 9, 9, 10, 193dacca5f0SHans Verkuil }; 194dacca5f0SHans Verkuil 195dacca5f0SHans Verkuil static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp) 196dacca5f0SHans Verkuil { 197dacca5f0SHans Verkuil const int *quant = quant_table; 198dacca5f0SHans Verkuil int i, j; 199dacca5f0SHans Verkuil 200dacca5f0SHans Verkuil for (j = 0; j < 8; j++) { 201dacca5f0SHans Verkuil for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { 202dacca5f0SHans Verkuil *coeff >>= *quant; 203dacca5f0SHans Verkuil if (*coeff >= -qp && *coeff <= qp) 204dacca5f0SHans Verkuil *coeff = *de_coeff = 0; 205dacca5f0SHans Verkuil else 206dacca5f0SHans Verkuil *de_coeff = *coeff << *quant; 207dacca5f0SHans Verkuil } 208dacca5f0SHans Verkuil } 209dacca5f0SHans Verkuil } 210dacca5f0SHans Verkuil 211dacca5f0SHans Verkuil static void dequantize_intra(s16 *coeff) 212dacca5f0SHans Verkuil { 213dacca5f0SHans Verkuil const int *quant = quant_table; 214dacca5f0SHans Verkuil int i, j; 215dacca5f0SHans Verkuil 216dacca5f0SHans Verkuil for (j = 0; j < 8; j++) 217dacca5f0SHans Verkuil for (i = 0; i < 8; i++, quant++, coeff++) 218dacca5f0SHans Verkuil *coeff <<= *quant; 219dacca5f0SHans Verkuil } 220dacca5f0SHans Verkuil 221dacca5f0SHans Verkuil static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp) 222dacca5f0SHans Verkuil { 223dacca5f0SHans Verkuil const int *quant = quant_table_p; 224dacca5f0SHans Verkuil int i, j; 225dacca5f0SHans Verkuil 226dacca5f0SHans Verkuil for (j = 0; j < 8; j++) { 227dacca5f0SHans Verkuil for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { 228dacca5f0SHans Verkuil *coeff >>= *quant; 229dacca5f0SHans Verkuil if (*coeff >= -qp && *coeff <= qp) 230dacca5f0SHans Verkuil *coeff = *de_coeff = 0; 231dacca5f0SHans Verkuil else 232dacca5f0SHans Verkuil *de_coeff = *coeff << *quant; 233dacca5f0SHans Verkuil } 234dacca5f0SHans Verkuil } 235dacca5f0SHans Verkuil } 236dacca5f0SHans Verkuil 237dacca5f0SHans Verkuil static void dequantize_inter(s16 *coeff) 238dacca5f0SHans Verkuil { 239dacca5f0SHans Verkuil const int *quant = quant_table_p; 240dacca5f0SHans Verkuil int i, j; 241dacca5f0SHans Verkuil 242dacca5f0SHans Verkuil for (j = 0; j < 8; j++) 243dacca5f0SHans Verkuil for (i = 0; i < 8; i++, quant++, coeff++) 244dacca5f0SHans Verkuil *coeff <<= *quant; 245dacca5f0SHans Verkuil } 246dacca5f0SHans Verkuil 247dacca5f0SHans Verkuil static void noinline_for_stack fwht(const u8 *block, s16 *output_block, 248dacca5f0SHans Verkuil unsigned int stride, 249dacca5f0SHans Verkuil unsigned int input_step, bool intra) 250dacca5f0SHans Verkuil { 251dacca5f0SHans Verkuil /* we'll need more than 8 bits for the transformed coefficients */ 252dacca5f0SHans Verkuil s32 workspace1[8], workspace2[8]; 253dacca5f0SHans Verkuil const u8 *tmp = block; 254dacca5f0SHans Verkuil s16 *out = output_block; 255dacca5f0SHans Verkuil int add = intra ? 256 : 0; 256dacca5f0SHans Verkuil unsigned int i; 257dacca5f0SHans Verkuil 258dacca5f0SHans Verkuil /* stage 1 */ 259dacca5f0SHans Verkuil for (i = 0; i < 8; i++, tmp += stride, out += 8) { 260dacca5f0SHans Verkuil switch (input_step) { 261dacca5f0SHans Verkuil case 1: 262dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[1] - add; 263dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[1]; 264dacca5f0SHans Verkuil 265dacca5f0SHans Verkuil workspace1[2] = tmp[2] + tmp[3] - add; 266dacca5f0SHans Verkuil workspace1[3] = tmp[2] - tmp[3]; 267dacca5f0SHans Verkuil 268dacca5f0SHans Verkuil workspace1[4] = tmp[4] + tmp[5] - add; 269dacca5f0SHans Verkuil workspace1[5] = tmp[4] - tmp[5]; 270dacca5f0SHans Verkuil 271dacca5f0SHans Verkuil workspace1[6] = tmp[6] + tmp[7] - add; 272dacca5f0SHans Verkuil workspace1[7] = tmp[6] - tmp[7]; 273dacca5f0SHans Verkuil break; 274dacca5f0SHans Verkuil case 2: 275dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[2] - add; 276dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[2]; 277dacca5f0SHans Verkuil 278dacca5f0SHans Verkuil workspace1[2] = tmp[4] + tmp[6] - add; 279dacca5f0SHans Verkuil workspace1[3] = tmp[4] - tmp[6]; 280dacca5f0SHans Verkuil 281dacca5f0SHans Verkuil workspace1[4] = tmp[8] + tmp[10] - add; 282dacca5f0SHans Verkuil workspace1[5] = tmp[8] - tmp[10]; 283dacca5f0SHans Verkuil 284dacca5f0SHans Verkuil workspace1[6] = tmp[12] + tmp[14] - add; 285dacca5f0SHans Verkuil workspace1[7] = tmp[12] - tmp[14]; 286dacca5f0SHans Verkuil break; 287dacca5f0SHans Verkuil case 3: 288dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[3] - add; 289dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[3]; 290dacca5f0SHans Verkuil 291dacca5f0SHans Verkuil workspace1[2] = tmp[6] + tmp[9] - add; 292dacca5f0SHans Verkuil workspace1[3] = tmp[6] - tmp[9]; 293dacca5f0SHans Verkuil 294dacca5f0SHans Verkuil workspace1[4] = tmp[12] + tmp[15] - add; 295dacca5f0SHans Verkuil workspace1[5] = tmp[12] - tmp[15]; 296dacca5f0SHans Verkuil 297dacca5f0SHans Verkuil workspace1[6] = tmp[18] + tmp[21] - add; 298dacca5f0SHans Verkuil workspace1[7] = tmp[18] - tmp[21]; 299dacca5f0SHans Verkuil break; 300dacca5f0SHans Verkuil default: 301dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[4] - add; 302dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[4]; 303dacca5f0SHans Verkuil 304dacca5f0SHans Verkuil workspace1[2] = tmp[8] + tmp[12] - add; 305dacca5f0SHans Verkuil workspace1[3] = tmp[8] - tmp[12]; 306dacca5f0SHans Verkuil 307dacca5f0SHans Verkuil workspace1[4] = tmp[16] + tmp[20] - add; 308dacca5f0SHans Verkuil workspace1[5] = tmp[16] - tmp[20]; 309dacca5f0SHans Verkuil 310dacca5f0SHans Verkuil workspace1[6] = tmp[24] + tmp[28] - add; 311dacca5f0SHans Verkuil workspace1[7] = tmp[24] - tmp[28]; 312dacca5f0SHans Verkuil break; 313dacca5f0SHans Verkuil } 314dacca5f0SHans Verkuil 315dacca5f0SHans Verkuil /* stage 2 */ 316dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2]; 317dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2]; 318dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3]; 319dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3]; 320dacca5f0SHans Verkuil 321dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6]; 322dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6]; 323dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7]; 324dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7]; 325dacca5f0SHans Verkuil 326dacca5f0SHans Verkuil /* stage 3 */ 327dacca5f0SHans Verkuil out[0] = workspace2[0] + workspace2[4]; 328dacca5f0SHans Verkuil out[1] = workspace2[0] - workspace2[4]; 329dacca5f0SHans Verkuil out[2] = workspace2[1] - workspace2[5]; 330dacca5f0SHans Verkuil out[3] = workspace2[1] + workspace2[5]; 331dacca5f0SHans Verkuil out[4] = workspace2[2] + workspace2[6]; 332dacca5f0SHans Verkuil out[5] = workspace2[2] - workspace2[6]; 333dacca5f0SHans Verkuil out[6] = workspace2[3] - workspace2[7]; 334dacca5f0SHans Verkuil out[7] = workspace2[3] + workspace2[7]; 335dacca5f0SHans Verkuil } 336dacca5f0SHans Verkuil 337dacca5f0SHans Verkuil out = output_block; 338dacca5f0SHans Verkuil 339dacca5f0SHans Verkuil for (i = 0; i < 8; i++, out++) { 340dacca5f0SHans Verkuil /* stage 1 */ 341dacca5f0SHans Verkuil workspace1[0] = out[0] + out[1 * 8]; 342dacca5f0SHans Verkuil workspace1[1] = out[0] - out[1 * 8]; 343dacca5f0SHans Verkuil 344dacca5f0SHans Verkuil workspace1[2] = out[2 * 8] + out[3 * 8]; 345dacca5f0SHans Verkuil workspace1[3] = out[2 * 8] - out[3 * 8]; 346dacca5f0SHans Verkuil 347dacca5f0SHans Verkuil workspace1[4] = out[4 * 8] + out[5 * 8]; 348dacca5f0SHans Verkuil workspace1[5] = out[4 * 8] - out[5 * 8]; 349dacca5f0SHans Verkuil 350dacca5f0SHans Verkuil workspace1[6] = out[6 * 8] + out[7 * 8]; 351dacca5f0SHans Verkuil workspace1[7] = out[6 * 8] - out[7 * 8]; 352dacca5f0SHans Verkuil 353dacca5f0SHans Verkuil /* stage 2 */ 354dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2]; 355dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2]; 356dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3]; 357dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3]; 358dacca5f0SHans Verkuil 359dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6]; 360dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6]; 361dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7]; 362dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7]; 363dacca5f0SHans Verkuil /* stage 3 */ 364dacca5f0SHans Verkuil out[0 * 8] = workspace2[0] + workspace2[4]; 365dacca5f0SHans Verkuil out[1 * 8] = workspace2[0] - workspace2[4]; 366dacca5f0SHans Verkuil out[2 * 8] = workspace2[1] - workspace2[5]; 367dacca5f0SHans Verkuil out[3 * 8] = workspace2[1] + workspace2[5]; 368dacca5f0SHans Verkuil out[4 * 8] = workspace2[2] + workspace2[6]; 369dacca5f0SHans Verkuil out[5 * 8] = workspace2[2] - workspace2[6]; 370dacca5f0SHans Verkuil out[6 * 8] = workspace2[3] - workspace2[7]; 371dacca5f0SHans Verkuil out[7 * 8] = workspace2[3] + workspace2[7]; 372dacca5f0SHans Verkuil } 373dacca5f0SHans Verkuil } 374dacca5f0SHans Verkuil 375dacca5f0SHans Verkuil /* 376dacca5f0SHans Verkuil * Not the nicest way of doing it, but P-blocks get twice the range of 377dacca5f0SHans Verkuil * that of the I-blocks. Therefore we need a type bigger than 8 bits. 378dacca5f0SHans Verkuil * Furthermore values can be negative... This is just a version that 379dacca5f0SHans Verkuil * works with 16 signed data 380dacca5f0SHans Verkuil */ 381dacca5f0SHans Verkuil static void noinline_for_stack 382dacca5f0SHans Verkuil fwht16(const s16 *block, s16 *output_block, int stride, int intra) 383dacca5f0SHans Verkuil { 384dacca5f0SHans Verkuil /* we'll need more than 8 bits for the transformed coefficients */ 385dacca5f0SHans Verkuil s32 workspace1[8], workspace2[8]; 386dacca5f0SHans Verkuil const s16 *tmp = block; 387dacca5f0SHans Verkuil s16 *out = output_block; 388dacca5f0SHans Verkuil int i; 389dacca5f0SHans Verkuil 390dacca5f0SHans Verkuil for (i = 0; i < 8; i++, tmp += stride, out += 8) { 391dacca5f0SHans Verkuil /* stage 1 */ 392dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[1]; 393dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[1]; 394dacca5f0SHans Verkuil 395dacca5f0SHans Verkuil workspace1[2] = tmp[2] + tmp[3]; 396dacca5f0SHans Verkuil workspace1[3] = tmp[2] - tmp[3]; 397dacca5f0SHans Verkuil 398dacca5f0SHans Verkuil workspace1[4] = tmp[4] + tmp[5]; 399dacca5f0SHans Verkuil workspace1[5] = tmp[4] - tmp[5]; 400dacca5f0SHans Verkuil 401dacca5f0SHans Verkuil workspace1[6] = tmp[6] + tmp[7]; 402dacca5f0SHans Verkuil workspace1[7] = tmp[6] - tmp[7]; 403dacca5f0SHans Verkuil 404dacca5f0SHans Verkuil /* stage 2 */ 405dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2]; 406dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2]; 407dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3]; 408dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3]; 409dacca5f0SHans Verkuil 410dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6]; 411dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6]; 412dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7]; 413dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7]; 414dacca5f0SHans Verkuil 415dacca5f0SHans Verkuil /* stage 3 */ 416dacca5f0SHans Verkuil out[0] = workspace2[0] + workspace2[4]; 417dacca5f0SHans Verkuil out[1] = workspace2[0] - workspace2[4]; 418dacca5f0SHans Verkuil out[2] = workspace2[1] - workspace2[5]; 419dacca5f0SHans Verkuil out[3] = workspace2[1] + workspace2[5]; 420dacca5f0SHans Verkuil out[4] = workspace2[2] + workspace2[6]; 421dacca5f0SHans Verkuil out[5] = workspace2[2] - workspace2[6]; 422dacca5f0SHans Verkuil out[6] = workspace2[3] - workspace2[7]; 423dacca5f0SHans Verkuil out[7] = workspace2[3] + workspace2[7]; 424dacca5f0SHans Verkuil } 425dacca5f0SHans Verkuil 426dacca5f0SHans Verkuil out = output_block; 427dacca5f0SHans Verkuil 428dacca5f0SHans Verkuil for (i = 0; i < 8; i++, out++) { 429dacca5f0SHans Verkuil /* stage 1 */ 430dacca5f0SHans Verkuil workspace1[0] = out[0] + out[1*8]; 431dacca5f0SHans Verkuil workspace1[1] = out[0] - out[1*8]; 432dacca5f0SHans Verkuil 433dacca5f0SHans Verkuil workspace1[2] = out[2*8] + out[3*8]; 434dacca5f0SHans Verkuil workspace1[3] = out[2*8] - out[3*8]; 435dacca5f0SHans Verkuil 436dacca5f0SHans Verkuil workspace1[4] = out[4*8] + out[5*8]; 437dacca5f0SHans Verkuil workspace1[5] = out[4*8] - out[5*8]; 438dacca5f0SHans Verkuil 439dacca5f0SHans Verkuil workspace1[6] = out[6*8] + out[7*8]; 440dacca5f0SHans Verkuil workspace1[7] = out[6*8] - out[7*8]; 441dacca5f0SHans Verkuil 442dacca5f0SHans Verkuil /* stage 2 */ 443dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2]; 444dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2]; 445dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3]; 446dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3]; 447dacca5f0SHans Verkuil 448dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6]; 449dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6]; 450dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7]; 451dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7]; 452dacca5f0SHans Verkuil 453dacca5f0SHans Verkuil /* stage 3 */ 454dacca5f0SHans Verkuil out[0*8] = workspace2[0] + workspace2[4]; 455dacca5f0SHans Verkuil out[1*8] = workspace2[0] - workspace2[4]; 456dacca5f0SHans Verkuil out[2*8] = workspace2[1] - workspace2[5]; 457dacca5f0SHans Verkuil out[3*8] = workspace2[1] + workspace2[5]; 458dacca5f0SHans Verkuil out[4*8] = workspace2[2] + workspace2[6]; 459dacca5f0SHans Verkuil out[5*8] = workspace2[2] - workspace2[6]; 460dacca5f0SHans Verkuil out[6*8] = workspace2[3] - workspace2[7]; 461dacca5f0SHans Verkuil out[7*8] = workspace2[3] + workspace2[7]; 462dacca5f0SHans Verkuil } 463dacca5f0SHans Verkuil } 464dacca5f0SHans Verkuil 465dacca5f0SHans Verkuil static noinline_for_stack void 466dacca5f0SHans Verkuil ifwht(const s16 *block, s16 *output_block, int intra) 467dacca5f0SHans Verkuil { 468dacca5f0SHans Verkuil /* 469dacca5f0SHans Verkuil * we'll need more than 8 bits for the transformed coefficients 470dacca5f0SHans Verkuil * use native unit of cpu 471dacca5f0SHans Verkuil */ 472dacca5f0SHans Verkuil int workspace1[8], workspace2[8]; 473dacca5f0SHans Verkuil int inter = intra ? 0 : 1; 474dacca5f0SHans Verkuil const s16 *tmp = block; 475dacca5f0SHans Verkuil s16 *out = output_block; 476dacca5f0SHans Verkuil int i; 477dacca5f0SHans Verkuil 478dacca5f0SHans Verkuil for (i = 0; i < 8; i++, tmp += 8, out += 8) { 479dacca5f0SHans Verkuil /* stage 1 */ 480dacca5f0SHans Verkuil workspace1[0] = tmp[0] + tmp[1]; 481dacca5f0SHans Verkuil workspace1[1] = tmp[0] - tmp[1]; 482dacca5f0SHans Verkuil 483dacca5f0SHans Verkuil workspace1[2] = tmp[2] + tmp[3]; 484dacca5f0SHans Verkuil workspace1[3] = tmp[2] - tmp[3]; 485dacca5f0SHans Verkuil 486dacca5f0SHans Verkuil workspace1[4] = tmp[4] + tmp[5]; 487dacca5f0SHans Verkuil workspace1[5] = tmp[4] - tmp[5]; 488dacca5f0SHans Verkuil 489dacca5f0SHans Verkuil workspace1[6] = tmp[6] + tmp[7]; 490dacca5f0SHans Verkuil workspace1[7] = tmp[6] - tmp[7]; 491dacca5f0SHans Verkuil 492dacca5f0SHans Verkuil /* stage 2 */ 493dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2]; 494dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2]; 495dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3]; 496dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3]; 497dacca5f0SHans Verkuil 498dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6]; 499dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6]; 500dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7]; 501dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7]; 502dacca5f0SHans Verkuil 503dacca5f0SHans Verkuil /* stage 3 */ 504dacca5f0SHans Verkuil out[0] = workspace2[0] + workspace2[4]; 505dacca5f0SHans Verkuil out[1] = workspace2[0] - workspace2[4]; 506dacca5f0SHans Verkuil out[2] = workspace2[1] - workspace2[5]; 507dacca5f0SHans Verkuil out[3] = workspace2[1] + workspace2[5]; 508dacca5f0SHans Verkuil out[4] = workspace2[2] + workspace2[6]; 509dacca5f0SHans Verkuil out[5] = workspace2[2] - workspace2[6]; 510dacca5f0SHans Verkuil out[6] = workspace2[3] - workspace2[7]; 511dacca5f0SHans Verkuil out[7] = workspace2[3] + workspace2[7]; 512dacca5f0SHans Verkuil } 513dacca5f0SHans Verkuil 514dacca5f0SHans Verkuil out = output_block; 515dacca5f0SHans Verkuil 516dacca5f0SHans Verkuil for (i = 0; i < 8; i++, out++) { 517dacca5f0SHans Verkuil /* stage 1 */ 518dacca5f0SHans Verkuil workspace1[0] = out[0] + out[1 * 8]; 519dacca5f0SHans Verkuil workspace1[1] = out[0] - out[1 * 8]; 520dacca5f0SHans Verkuil 521dacca5f0SHans Verkuil workspace1[2] = out[2 * 8] + out[3 * 8]; 522dacca5f0SHans Verkuil workspace1[3] = out[2 * 8] - out[3 * 8]; 523dacca5f0SHans Verkuil 524dacca5f0SHans Verkuil workspace1[4] = out[4 * 8] + out[5 * 8]; 525dacca5f0SHans Verkuil workspace1[5] = out[4 * 8] - out[5 * 8]; 526dacca5f0SHans Verkuil 527dacca5f0SHans Verkuil workspace1[6] = out[6 * 8] + out[7 * 8]; 528dacca5f0SHans Verkuil workspace1[7] = out[6 * 8] - out[7 * 8]; 529dacca5f0SHans Verkuil 530dacca5f0SHans Verkuil /* stage 2 */ 531dacca5f0SHans Verkuil workspace2[0] = workspace1[0] + workspace1[2]; 532dacca5f0SHans Verkuil workspace2[1] = workspace1[0] - workspace1[2]; 533dacca5f0SHans Verkuil workspace2[2] = workspace1[1] - workspace1[3]; 534dacca5f0SHans Verkuil workspace2[3] = workspace1[1] + workspace1[3]; 535dacca5f0SHans Verkuil 536dacca5f0SHans Verkuil workspace2[4] = workspace1[4] + workspace1[6]; 537dacca5f0SHans Verkuil workspace2[5] = workspace1[4] - workspace1[6]; 538dacca5f0SHans Verkuil workspace2[6] = workspace1[5] - workspace1[7]; 539dacca5f0SHans Verkuil workspace2[7] = workspace1[5] + workspace1[7]; 540dacca5f0SHans Verkuil 541dacca5f0SHans Verkuil /* stage 3 */ 542dacca5f0SHans Verkuil if (inter) { 543dacca5f0SHans Verkuil int d; 544dacca5f0SHans Verkuil 545dacca5f0SHans Verkuil out[0 * 8] = workspace2[0] + workspace2[4]; 546dacca5f0SHans Verkuil out[1 * 8] = workspace2[0] - workspace2[4]; 547dacca5f0SHans Verkuil out[2 * 8] = workspace2[1] - workspace2[5]; 548dacca5f0SHans Verkuil out[3 * 8] = workspace2[1] + workspace2[5]; 549dacca5f0SHans Verkuil out[4 * 8] = workspace2[2] + workspace2[6]; 550dacca5f0SHans Verkuil out[5 * 8] = workspace2[2] - workspace2[6]; 551dacca5f0SHans Verkuil out[6 * 8] = workspace2[3] - workspace2[7]; 552dacca5f0SHans Verkuil out[7 * 8] = workspace2[3] + workspace2[7]; 553dacca5f0SHans Verkuil 554dacca5f0SHans Verkuil for (d = 0; d < 8; d++) 555dacca5f0SHans Verkuil out[8 * d] >>= 6; 556dacca5f0SHans Verkuil } else { 557dacca5f0SHans Verkuil int d; 558dacca5f0SHans Verkuil 559dacca5f0SHans Verkuil out[0 * 8] = workspace2[0] + workspace2[4]; 560dacca5f0SHans Verkuil out[1 * 8] = workspace2[0] - workspace2[4]; 561dacca5f0SHans Verkuil out[2 * 8] = workspace2[1] - workspace2[5]; 562dacca5f0SHans Verkuil out[3 * 8] = workspace2[1] + workspace2[5]; 563dacca5f0SHans Verkuil out[4 * 8] = workspace2[2] + workspace2[6]; 564dacca5f0SHans Verkuil out[5 * 8] = workspace2[2] - workspace2[6]; 565dacca5f0SHans Verkuil out[6 * 8] = workspace2[3] - workspace2[7]; 566dacca5f0SHans Verkuil out[7 * 8] = workspace2[3] + workspace2[7]; 567dacca5f0SHans Verkuil 568dacca5f0SHans Verkuil for (d = 0; d < 8; d++) { 569dacca5f0SHans Verkuil out[8 * d] >>= 6; 570dacca5f0SHans Verkuil out[8 * d] += 128; 571dacca5f0SHans Verkuil } 572dacca5f0SHans Verkuil } 573dacca5f0SHans Verkuil } 574dacca5f0SHans Verkuil } 575dacca5f0SHans Verkuil 576dacca5f0SHans Verkuil static void fill_encoder_block(const u8 *input, s16 *dst, 577dacca5f0SHans Verkuil unsigned int stride, unsigned int input_step) 578dacca5f0SHans Verkuil { 579dacca5f0SHans Verkuil int i, j; 580dacca5f0SHans Verkuil 581dacca5f0SHans Verkuil for (i = 0; i < 8; i++) { 582dacca5f0SHans Verkuil for (j = 0; j < 8; j++, input += input_step) 583dacca5f0SHans Verkuil *dst++ = *input; 584dacca5f0SHans Verkuil input += stride - 8 * input_step; 585dacca5f0SHans Verkuil } 586dacca5f0SHans Verkuil } 587dacca5f0SHans Verkuil 588dacca5f0SHans Verkuil static int var_intra(const s16 *input) 589dacca5f0SHans Verkuil { 590dacca5f0SHans Verkuil int32_t mean = 0; 591dacca5f0SHans Verkuil int32_t ret = 0; 592dacca5f0SHans Verkuil const s16 *tmp = input; 593dacca5f0SHans Verkuil int i; 594dacca5f0SHans Verkuil 595dacca5f0SHans Verkuil for (i = 0; i < 8 * 8; i++, tmp++) 596dacca5f0SHans Verkuil mean += *tmp; 597dacca5f0SHans Verkuil mean /= 64; 598dacca5f0SHans Verkuil tmp = input; 599dacca5f0SHans Verkuil for (i = 0; i < 8 * 8; i++, tmp++) 600dacca5f0SHans Verkuil ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean); 601dacca5f0SHans Verkuil return ret; 602dacca5f0SHans Verkuil } 603dacca5f0SHans Verkuil 604dacca5f0SHans Verkuil static int var_inter(const s16 *old, const s16 *new) 605dacca5f0SHans Verkuil { 606dacca5f0SHans Verkuil int32_t ret = 0; 607dacca5f0SHans Verkuil int i; 608dacca5f0SHans Verkuil 609dacca5f0SHans Verkuil for (i = 0; i < 8 * 8; i++, old++, new++) 610dacca5f0SHans Verkuil ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new); 611dacca5f0SHans Verkuil return ret; 612dacca5f0SHans Verkuil } 613dacca5f0SHans Verkuil 614dacca5f0SHans Verkuil static noinline_for_stack int 615dacca5f0SHans Verkuil decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock, 616dacca5f0SHans Verkuil unsigned int stride, unsigned int input_step) 617dacca5f0SHans Verkuil { 618dacca5f0SHans Verkuil s16 tmp[64]; 619dacca5f0SHans Verkuil s16 old[64]; 620dacca5f0SHans Verkuil s16 *work = tmp; 621dacca5f0SHans Verkuil unsigned int k, l; 622dacca5f0SHans Verkuil int vari; 623dacca5f0SHans Verkuil int vard; 624dacca5f0SHans Verkuil 625dacca5f0SHans Verkuil fill_encoder_block(cur, tmp, stride, input_step); 626dacca5f0SHans Verkuil fill_encoder_block(reference, old, 8, 1); 627dacca5f0SHans Verkuil vari = var_intra(tmp); 628dacca5f0SHans Verkuil 629dacca5f0SHans Verkuil for (k = 0; k < 8; k++) { 630dacca5f0SHans Verkuil for (l = 0; l < 8; l++) { 631dacca5f0SHans Verkuil *deltablock = *work - *reference; 632dacca5f0SHans Verkuil deltablock++; 633dacca5f0SHans Verkuil work++; 634dacca5f0SHans Verkuil reference++; 635dacca5f0SHans Verkuil } 636dacca5f0SHans Verkuil } 637dacca5f0SHans Verkuil deltablock -= 64; 638dacca5f0SHans Verkuil vard = var_inter(old, tmp); 639dacca5f0SHans Verkuil return vari <= vard ? IBLOCK : PBLOCK; 640dacca5f0SHans Verkuil } 641dacca5f0SHans Verkuil 642dacca5f0SHans Verkuil static void fill_decoder_block(u8 *dst, const s16 *input, int stride, 643dacca5f0SHans Verkuil unsigned int dst_step) 644dacca5f0SHans Verkuil { 645dacca5f0SHans Verkuil int i, j; 646dacca5f0SHans Verkuil 647dacca5f0SHans Verkuil for (i = 0; i < 8; i++) { 648dacca5f0SHans Verkuil for (j = 0; j < 8; j++, input++, dst += dst_step) { 649dacca5f0SHans Verkuil if (*input < 0) 650dacca5f0SHans Verkuil *dst = 0; 651dacca5f0SHans Verkuil else if (*input > 255) 652dacca5f0SHans Verkuil *dst = 255; 653dacca5f0SHans Verkuil else 654dacca5f0SHans Verkuil *dst = *input; 655dacca5f0SHans Verkuil } 656dacca5f0SHans Verkuil dst += stride - (8 * dst_step); 657dacca5f0SHans Verkuil } 658dacca5f0SHans Verkuil } 659dacca5f0SHans Verkuil 660dacca5f0SHans Verkuil static void add_deltas(s16 *deltas, const u8 *ref, int stride, 661dacca5f0SHans Verkuil unsigned int ref_step) 662dacca5f0SHans Verkuil { 663dacca5f0SHans Verkuil int k, l; 664dacca5f0SHans Verkuil 665dacca5f0SHans Verkuil for (k = 0; k < 8; k++) { 666dacca5f0SHans Verkuil for (l = 0; l < 8; l++) { 667dacca5f0SHans Verkuil *deltas += *ref; 668dacca5f0SHans Verkuil ref += ref_step; 669dacca5f0SHans Verkuil /* 670dacca5f0SHans Verkuil * Due to quantizing, it might possible that the 671dacca5f0SHans Verkuil * decoded coefficients are slightly out of range 672dacca5f0SHans Verkuil */ 673dacca5f0SHans Verkuil if (*deltas < 0) 674dacca5f0SHans Verkuil *deltas = 0; 675dacca5f0SHans Verkuil else if (*deltas > 255) 676dacca5f0SHans Verkuil *deltas = 255; 677dacca5f0SHans Verkuil deltas++; 678dacca5f0SHans Verkuil } 679dacca5f0SHans Verkuil ref += stride - (8 * ref_step); 680dacca5f0SHans Verkuil } 681dacca5f0SHans Verkuil } 682dacca5f0SHans Verkuil 683dacca5f0SHans Verkuil static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max, 684dacca5f0SHans Verkuil struct fwht_cframe *cf, u32 height, u32 width, 685dacca5f0SHans Verkuil u32 stride, unsigned int input_step, 686dacca5f0SHans Verkuil bool is_intra, bool next_is_intra) 687dacca5f0SHans Verkuil { 688dacca5f0SHans Verkuil u8 *input_start = input; 689dacca5f0SHans Verkuil __be16 *rlco_start = *rlco; 690dacca5f0SHans Verkuil s16 deltablock[64]; 691dacca5f0SHans Verkuil __be16 pframe_bit = htons(PFRAME_BIT); 692dacca5f0SHans Verkuil u32 encoding = 0; 693dacca5f0SHans Verkuil unsigned int last_size = 0; 694dacca5f0SHans Verkuil unsigned int i, j; 695dacca5f0SHans Verkuil 696dacca5f0SHans Verkuil width = round_up(width, 8); 697dacca5f0SHans Verkuil height = round_up(height, 8); 698dacca5f0SHans Verkuil 699dacca5f0SHans Verkuil for (j = 0; j < height / 8; j++) { 700dacca5f0SHans Verkuil input = input_start + j * 8 * stride; 701dacca5f0SHans Verkuil for (i = 0; i < width / 8; i++) { 702dacca5f0SHans Verkuil /* intra code, first frame is always intra coded. */ 703dacca5f0SHans Verkuil int blocktype = IBLOCK; 704dacca5f0SHans Verkuil unsigned int size; 705dacca5f0SHans Verkuil 706dacca5f0SHans Verkuil if (!is_intra) 707dacca5f0SHans Verkuil blocktype = decide_blocktype(input, refp, 708dacca5f0SHans Verkuil deltablock, stride, input_step); 709dacca5f0SHans Verkuil if (blocktype == IBLOCK) { 710dacca5f0SHans Verkuil fwht(input, cf->coeffs, stride, input_step, 1); 711dacca5f0SHans Verkuil quantize_intra(cf->coeffs, cf->de_coeffs, 712dacca5f0SHans Verkuil cf->i_frame_qp); 713dacca5f0SHans Verkuil } else { 714dacca5f0SHans Verkuil /* inter code */ 715dacca5f0SHans Verkuil encoding |= FWHT_FRAME_PCODED; 716dacca5f0SHans Verkuil fwht16(deltablock, cf->coeffs, 8, 0); 717dacca5f0SHans Verkuil quantize_inter(cf->coeffs, cf->de_coeffs, 718dacca5f0SHans Verkuil cf->p_frame_qp); 719dacca5f0SHans Verkuil } 720dacca5f0SHans Verkuil if (!next_is_intra) { 721dacca5f0SHans Verkuil ifwht(cf->de_coeffs, cf->de_fwht, blocktype); 722dacca5f0SHans Verkuil 723dacca5f0SHans Verkuil if (blocktype == PBLOCK) 724dacca5f0SHans Verkuil add_deltas(cf->de_fwht, refp, 8, 1); 725dacca5f0SHans Verkuil fill_decoder_block(refp, cf->de_fwht, 8, 1); 726dacca5f0SHans Verkuil } 727dacca5f0SHans Verkuil 728dacca5f0SHans Verkuil input += 8 * input_step; 729dacca5f0SHans Verkuil refp += 8 * 8; 730dacca5f0SHans Verkuil 731dacca5f0SHans Verkuil size = rlc(cf->coeffs, *rlco, blocktype); 732dacca5f0SHans Verkuil if (last_size == size && 733dacca5f0SHans Verkuil !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) { 734dacca5f0SHans Verkuil __be16 *last_rlco = *rlco - size; 735dacca5f0SHans Verkuil s16 hdr = ntohs(*last_rlco); 736dacca5f0SHans Verkuil 737dacca5f0SHans Verkuil if (!((*last_rlco ^ **rlco) & pframe_bit) && 738dacca5f0SHans Verkuil (hdr & DUPS_MASK) < DUPS_MASK) 739dacca5f0SHans Verkuil *last_rlco = htons(hdr + 2); 740dacca5f0SHans Verkuil else 741dacca5f0SHans Verkuil *rlco += size; 742dacca5f0SHans Verkuil } else { 743dacca5f0SHans Verkuil *rlco += size; 744dacca5f0SHans Verkuil } 745dacca5f0SHans Verkuil if (*rlco >= rlco_max) { 746dacca5f0SHans Verkuil encoding |= FWHT_FRAME_UNENCODED; 747dacca5f0SHans Verkuil goto exit_loop; 748dacca5f0SHans Verkuil } 749dacca5f0SHans Verkuil last_size = size; 750dacca5f0SHans Verkuil } 751dacca5f0SHans Verkuil } 752dacca5f0SHans Verkuil 753dacca5f0SHans Verkuil exit_loop: 754dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED) { 755dacca5f0SHans Verkuil u8 *out = (u8 *)rlco_start; 756dacca5f0SHans Verkuil u8 *p; 757dacca5f0SHans Verkuil 758dacca5f0SHans Verkuil input = input_start; 759dacca5f0SHans Verkuil /* 760dacca5f0SHans Verkuil * The compressed stream should never contain the magic 761dacca5f0SHans Verkuil * header, so when we copy the YUV data we replace 0xff 762dacca5f0SHans Verkuil * by 0xfe. Since YUV is limited range such values 763dacca5f0SHans Verkuil * shouldn't appear anyway. 764dacca5f0SHans Verkuil */ 765dacca5f0SHans Verkuil for (j = 0; j < height; j++) { 766dacca5f0SHans Verkuil for (i = 0, p = input; i < width; i++, p += input_step) 767dacca5f0SHans Verkuil *out++ = (*p == 0xff) ? 0xfe : *p; 768dacca5f0SHans Verkuil input += stride; 769dacca5f0SHans Verkuil } 770dacca5f0SHans Verkuil *rlco = (__be16 *)out; 771dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_PCODED; 772dacca5f0SHans Verkuil } 773dacca5f0SHans Verkuil return encoding; 774dacca5f0SHans Verkuil } 775dacca5f0SHans Verkuil 776dacca5f0SHans Verkuil u32 fwht_encode_frame(struct fwht_raw_frame *frm, 777dacca5f0SHans Verkuil struct fwht_raw_frame *ref_frm, 778dacca5f0SHans Verkuil struct fwht_cframe *cf, 779dacca5f0SHans Verkuil bool is_intra, bool next_is_intra, 780dacca5f0SHans Verkuil unsigned int width, unsigned int height, 781dacca5f0SHans Verkuil unsigned int stride, unsigned int chroma_stride) 782dacca5f0SHans Verkuil { 783dacca5f0SHans Verkuil unsigned int size = height * width; 784dacca5f0SHans Verkuil __be16 *rlco = cf->rlc_data; 785dacca5f0SHans Verkuil __be16 *rlco_max; 786dacca5f0SHans Verkuil u32 encoding; 787dacca5f0SHans Verkuil 788dacca5f0SHans Verkuil rlco_max = rlco + size / 2 - 256; 789dacca5f0SHans Verkuil encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf, 790dacca5f0SHans Verkuil height, width, stride, 791dacca5f0SHans Verkuil frm->luma_alpha_step, is_intra, next_is_intra); 792dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED) 793dacca5f0SHans Verkuil encoding |= FWHT_LUMA_UNENCODED; 794dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED; 795dacca5f0SHans Verkuil 796dacca5f0SHans Verkuil if (frm->components_num >= 3) { 797dacca5f0SHans Verkuil u32 chroma_h = height / frm->height_div; 798dacca5f0SHans Verkuil u32 chroma_w = width / frm->width_div; 799dacca5f0SHans Verkuil unsigned int chroma_size = chroma_h * chroma_w; 800dacca5f0SHans Verkuil 801dacca5f0SHans Verkuil rlco_max = rlco + chroma_size / 2 - 256; 802dacca5f0SHans Verkuil encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, 803dacca5f0SHans Verkuil cf, chroma_h, chroma_w, 804dacca5f0SHans Verkuil chroma_stride, frm->chroma_step, 805dacca5f0SHans Verkuil is_intra, next_is_intra); 806dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED) 807dacca5f0SHans Verkuil encoding |= FWHT_CB_UNENCODED; 808dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED; 809dacca5f0SHans Verkuil rlco_max = rlco + chroma_size / 2 - 256; 810dacca5f0SHans Verkuil encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, 811dacca5f0SHans Verkuil cf, chroma_h, chroma_w, 812dacca5f0SHans Verkuil chroma_stride, frm->chroma_step, 813dacca5f0SHans Verkuil is_intra, next_is_intra); 814dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED) 815dacca5f0SHans Verkuil encoding |= FWHT_CR_UNENCODED; 816dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED; 817dacca5f0SHans Verkuil } 818dacca5f0SHans Verkuil 819dacca5f0SHans Verkuil if (frm->components_num == 4) { 820dacca5f0SHans Verkuil rlco_max = rlco + size / 2 - 256; 821dacca5f0SHans Verkuil encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco, 822dacca5f0SHans Verkuil rlco_max, cf, height, width, 823dacca5f0SHans Verkuil stride, frm->luma_alpha_step, 824dacca5f0SHans Verkuil is_intra, next_is_intra); 825dacca5f0SHans Verkuil if (encoding & FWHT_FRAME_UNENCODED) 826dacca5f0SHans Verkuil encoding |= FWHT_ALPHA_UNENCODED; 827dacca5f0SHans Verkuil encoding &= ~FWHT_FRAME_UNENCODED; 828dacca5f0SHans Verkuil } 829dacca5f0SHans Verkuil 830dacca5f0SHans Verkuil cf->size = (rlco - cf->rlc_data) * sizeof(*rlco); 831dacca5f0SHans Verkuil return encoding; 832dacca5f0SHans Verkuil } 833dacca5f0SHans Verkuil 834dacca5f0SHans Verkuil static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco, 835dacca5f0SHans Verkuil u32 height, u32 width, const u8 *ref, u32 ref_stride, 836dacca5f0SHans Verkuil unsigned int ref_step, u8 *dst, 837dacca5f0SHans Verkuil unsigned int dst_stride, unsigned int dst_step, 838dacca5f0SHans Verkuil bool uncompressed, const __be16 *end_of_rlco_buf) 839dacca5f0SHans Verkuil { 840dacca5f0SHans Verkuil unsigned int copies = 0; 841dacca5f0SHans Verkuil s16 copy[8 * 8]; 842dacca5f0SHans Verkuil u16 stat; 843dacca5f0SHans Verkuil unsigned int i, j; 844dacca5f0SHans Verkuil bool is_intra = !ref; 845dacca5f0SHans Verkuil 846dacca5f0SHans Verkuil width = round_up(width, 8); 847dacca5f0SHans Verkuil height = round_up(height, 8); 848dacca5f0SHans Verkuil 849dacca5f0SHans Verkuil if (uncompressed) { 850dacca5f0SHans Verkuil int i; 851dacca5f0SHans Verkuil 852dacca5f0SHans Verkuil if (end_of_rlco_buf + 1 < *rlco + width * height / 2) 853dacca5f0SHans Verkuil return false; 854dacca5f0SHans Verkuil for (i = 0; i < height; i++) { 855dacca5f0SHans Verkuil memcpy(dst, *rlco, width); 856dacca5f0SHans Verkuil dst += dst_stride; 857dacca5f0SHans Verkuil *rlco += width / 2; 858dacca5f0SHans Verkuil } 859dacca5f0SHans Verkuil return true; 860dacca5f0SHans Verkuil } 861dacca5f0SHans Verkuil 862dacca5f0SHans Verkuil /* 863dacca5f0SHans Verkuil * When decoding each macroblock the rlco pointer will be increased 864dacca5f0SHans Verkuil * by 65 * 2 bytes worst-case. 865dacca5f0SHans Verkuil * To avoid overflow the buffer has to be 65/64th of the actual raw 866dacca5f0SHans Verkuil * image size, just in case someone feeds it malicious data. 867dacca5f0SHans Verkuil */ 868dacca5f0SHans Verkuil for (j = 0; j < height / 8; j++) { 869dacca5f0SHans Verkuil for (i = 0; i < width / 8; i++) { 870dacca5f0SHans Verkuil const u8 *refp = ref + j * 8 * ref_stride + 871dacca5f0SHans Verkuil i * 8 * ref_step; 872dacca5f0SHans Verkuil u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step; 873dacca5f0SHans Verkuil 874dacca5f0SHans Verkuil if (copies) { 875dacca5f0SHans Verkuil memcpy(cf->de_fwht, copy, sizeof(copy)); 876dacca5f0SHans Verkuil if ((stat & PFRAME_BIT) && !is_intra) 877dacca5f0SHans Verkuil add_deltas(cf->de_fwht, refp, 878dacca5f0SHans Verkuil ref_stride, ref_step); 879dacca5f0SHans Verkuil fill_decoder_block(dstp, cf->de_fwht, 880dacca5f0SHans Verkuil dst_stride, dst_step); 881dacca5f0SHans Verkuil copies--; 882dacca5f0SHans Verkuil continue; 883dacca5f0SHans Verkuil } 884dacca5f0SHans Verkuil 885dacca5f0SHans Verkuil stat = derlc(rlco, cf->coeffs, end_of_rlco_buf); 886dacca5f0SHans Verkuil if (stat & OVERFLOW_BIT) 887dacca5f0SHans Verkuil return false; 888dacca5f0SHans Verkuil if ((stat & PFRAME_BIT) && !is_intra) 889dacca5f0SHans Verkuil dequantize_inter(cf->coeffs); 890dacca5f0SHans Verkuil else 891dacca5f0SHans Verkuil dequantize_intra(cf->coeffs); 892dacca5f0SHans Verkuil 893dacca5f0SHans Verkuil ifwht(cf->coeffs, cf->de_fwht, 894dacca5f0SHans Verkuil ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1); 895dacca5f0SHans Verkuil 896dacca5f0SHans Verkuil copies = (stat & DUPS_MASK) >> 1; 897dacca5f0SHans Verkuil if (copies) 898dacca5f0SHans Verkuil memcpy(copy, cf->de_fwht, sizeof(copy)); 899dacca5f0SHans Verkuil if ((stat & PFRAME_BIT) && !is_intra) 900dacca5f0SHans Verkuil add_deltas(cf->de_fwht, refp, 901dacca5f0SHans Verkuil ref_stride, ref_step); 902dacca5f0SHans Verkuil fill_decoder_block(dstp, cf->de_fwht, dst_stride, 903dacca5f0SHans Verkuil dst_step); 904dacca5f0SHans Verkuil } 905dacca5f0SHans Verkuil } 906dacca5f0SHans Verkuil return true; 907dacca5f0SHans Verkuil } 908dacca5f0SHans Verkuil 909dacca5f0SHans Verkuil bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags, 910dacca5f0SHans Verkuil unsigned int components_num, unsigned int width, 911dacca5f0SHans Verkuil unsigned int height, const struct fwht_raw_frame *ref, 912dacca5f0SHans Verkuil unsigned int ref_stride, unsigned int ref_chroma_stride, 913dacca5f0SHans Verkuil struct fwht_raw_frame *dst, unsigned int dst_stride, 914dacca5f0SHans Verkuil unsigned int dst_chroma_stride) 915dacca5f0SHans Verkuil { 916dacca5f0SHans Verkuil const __be16 *rlco = cf->rlc_data; 917dacca5f0SHans Verkuil const __be16 *end_of_rlco_buf = cf->rlc_data + 918dacca5f0SHans Verkuil (cf->size / sizeof(*rlco)) - 1; 919dacca5f0SHans Verkuil 920dacca5f0SHans Verkuil if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride, 921dacca5f0SHans Verkuil ref->luma_alpha_step, dst->luma, dst_stride, 922dacca5f0SHans Verkuil dst->luma_alpha_step, 923*3abfc314SHans Verkuil hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED, 924dacca5f0SHans Verkuil end_of_rlco_buf)) 925dacca5f0SHans Verkuil return false; 926dacca5f0SHans Verkuil 927dacca5f0SHans Verkuil if (components_num >= 3) { 928dacca5f0SHans Verkuil u32 h = height; 929dacca5f0SHans Verkuil u32 w = width; 930dacca5f0SHans Verkuil 931*3abfc314SHans Verkuil if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT)) 932dacca5f0SHans Verkuil h /= 2; 933*3abfc314SHans Verkuil if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH)) 934dacca5f0SHans Verkuil w /= 2; 935dacca5f0SHans Verkuil 936dacca5f0SHans Verkuil if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride, 937dacca5f0SHans Verkuil ref->chroma_step, dst->cb, dst_chroma_stride, 938dacca5f0SHans Verkuil dst->chroma_step, 939*3abfc314SHans Verkuil hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED, 940dacca5f0SHans Verkuil end_of_rlco_buf)) 941dacca5f0SHans Verkuil return false; 942dacca5f0SHans Verkuil if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride, 943dacca5f0SHans Verkuil ref->chroma_step, dst->cr, dst_chroma_stride, 944dacca5f0SHans Verkuil dst->chroma_step, 945*3abfc314SHans Verkuil hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED, 946dacca5f0SHans Verkuil end_of_rlco_buf)) 947dacca5f0SHans Verkuil return false; 948dacca5f0SHans Verkuil } 949dacca5f0SHans Verkuil 950dacca5f0SHans Verkuil if (components_num == 4) 951dacca5f0SHans Verkuil if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride, 952dacca5f0SHans Verkuil ref->luma_alpha_step, dst->alpha, dst_stride, 953dacca5f0SHans Verkuil dst->luma_alpha_step, 954*3abfc314SHans Verkuil hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED, 955dacca5f0SHans Verkuil end_of_rlco_buf)) 956dacca5f0SHans Verkuil return false; 957dacca5f0SHans Verkuil return true; 958dacca5f0SHans Verkuil } 959